diff --git a/contrib/eigen/CMakeLists.txt b/contrib/eigen/CMakeLists.txt
index dbb9bcf220f2355e7fe9ccbee844dea1b1ff95ba..f3e69b845551d82198f70203c1b8bb356086d7d4 100644
--- a/contrib/eigen/CMakeLists.txt
+++ b/contrib/eigen/CMakeLists.txt
@@ -1,6 +1,7 @@
-project(Eigen3)
+# cmake_minimum_require must be the first command of the file
+cmake_minimum_required(VERSION 3.5.0)
 
-cmake_minimum_required(VERSION 2.8.5)
+project(Eigen3)
 
 # guard against in-source builds
 
@@ -8,6 +9,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
   message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
 endif()
 
+
 # Alias Eigen_*_DIR to Eigen3_*_DIR:
 
 set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
@@ -19,16 +21,9 @@ if (NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower)
-if(    NOT cmake_build_type_tolower STREQUAL "debug"
-   AND NOT cmake_build_type_tolower STREQUAL "release"
-   AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo")
-  message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).")
-endif()
-
 
 #############################################################################
-# retrieve version infomation                                               #
+# retrieve version information                                               #
 #############################################################################
 
 # automatically parse the version number
@@ -41,29 +36,28 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
 set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
 set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
 
-# if we are not in a mercurial clone
-if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
-  # if the mercurial program is absent or this will leave the EIGEN_HG_CHANGESET string empty,
+# if we are not in a git clone
+if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git)
+  # if the git program is absent or this will leave the EIGEN_GIT_REVNUM string empty,
   # but won't stop CMake.
-  execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
-  execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+  execute_process(COMMAND git ls-remote --refs -q ${CMAKE_SOURCE_DIR} HEAD OUTPUT_VARIABLE EIGEN_GIT_OUTPUT)
 endif()
 
-# if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
-if(EIGEN_BRANCH_OUTPUT MATCHES "default")
-string(REGEX MATCH "^changeset: *[0-9]*:([0-9;a-f]+).*" EIGEN_HG_CHANGESET_MATCH "${EIGEN_HGTIP_OUTPUT}")
-set(EIGEN_HG_CHANGESET "${CMAKE_MATCH_1}")
-endif(EIGEN_BRANCH_OUTPUT MATCHES "default")
+# extract the git rev number from the git output...
+if(EIGEN_GIT_OUTPUT)
+string(REGEX MATCH "^([0-9;a-f]+).*" EIGEN_GIT_CHANGESET_MATCH "${EIGEN_GIT_OUTPUT}")
+set(EIGEN_GIT_REVNUM "${CMAKE_MATCH_1}")
+endif()
 #...and show it next to the version number
-if(EIGEN_HG_CHANGESET)
-  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (mercurial changeset ${EIGEN_HG_CHANGESET})")
-else(EIGEN_HG_CHANGESET)
+if(EIGEN_GIT_REVNUM)
+  set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (git rev ${EIGEN_GIT_REVNUM})")
+else()
   set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER}")
-endif(EIGEN_HG_CHANGESET)
-
+endif()
 
 include(CheckCXXCompilerFlag)
 include(GNUInstallDirs)
+include(CMakeDependentOption)
 
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
 
@@ -78,7 +72,7 @@ macro(ei_add_cxx_compiler_flag FLAG)
   if(COMPILER_SUPPORT_${SFLAG})
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}")
   endif()
-endmacro(ei_add_cxx_compiler_flag)
+endmacro()
 
 check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
 
@@ -94,6 +88,9 @@ else()
   ei_add_cxx_compiler_flag("-std=c++03")
 endif()
 
+# Determine if we should build shared libraries on this platform.
+get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS)
+
 #############################################################################
 # find how to link to the standard libraries                                #
 #############################################################################
@@ -134,7 +131,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
   option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
 endif()
 
-set(CMAKE_INCLUDE_CURRENT_DIR ON)
+set(CMAKE_INCLUDE_CURRENT_DIR OFF)
 
 option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
 
@@ -158,7 +155,7 @@ if(NOT MSVC)
   ei_add_cxx_compiler_flag("-Wall")
   ei_add_cxx_compiler_flag("-Wextra")
   #ei_add_cxx_compiler_flag("-Weverything")              # clang
-  
+
   ei_add_cxx_compiler_flag("-Wundef")
   ei_add_cxx_compiler_flag("-Wcast-align")
   ei_add_cxx_compiler_flag("-Wchar-subscripts")
@@ -173,29 +170,25 @@ if(NOT MSVC)
   ei_add_cxx_compiler_flag("-Wc++11-extensions")
   ei_add_cxx_compiler_flag("-Wdouble-promotion")
 #  ei_add_cxx_compiler_flag("-Wconversion")
-  
-  # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
-  # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
-  if(NOT CMAKE_COMPILER_IS_GNUCXX)
-    ei_add_cxx_compiler_flag("-Wshadow")
-  endif()
-  
+
+  ei_add_cxx_compiler_flag("-Wshadow")
+
   ei_add_cxx_compiler_flag("-Wno-psabi")
   ei_add_cxx_compiler_flag("-Wno-variadic-macros")
   ei_add_cxx_compiler_flag("-Wno-long-long")
-  
+
   ei_add_cxx_compiler_flag("-fno-check-new")
   ei_add_cxx_compiler_flag("-fno-common")
   ei_add_cxx_compiler_flag("-fstrict-aliasing")
   ei_add_cxx_compiler_flag("-wd981")                    # disable ICC's "operands are evaluated in unspecified order" remark
   ei_add_cxx_compiler_flag("-wd2304")                   # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
-  
-  
+
+
   # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
   # Moreover we should not set both -strict-ansi and -ansi
   check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI)
   ei_add_cxx_compiler_flag("-Qunused-arguments")        # disable clang warning: argument unused during compilation: '-ansi'
-  
+
   if(COMPILER_SUPPORT_STRICTANSI)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi")
   else()
@@ -206,7 +199,7 @@ if(NOT MSVC)
     ei_add_cxx_compiler_flag("-pie")
     ei_add_cxx_compiler_flag("-fPIE")
   endif()
-  
+
   set(CMAKE_REQUIRED_FLAGS "")
 
   option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF)
@@ -251,12 +244,30 @@ if(NOT MSVC)
     message(STATUS "Enabling FMA in tests/examples")
   endif()
 
+  option(EIGEN_TEST_AVX2 "Enable/Disable AVX2 in tests/examples" OFF)
+  if(EIGEN_TEST_AVX2)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma")
+    message(STATUS "Enabling AVX2 in tests/examples")
+  endif()
+
   option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
   if(EIGEN_TEST_AVX512)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma")
+    if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
+    endif()
     message(STATUS "Enabling AVX512 in tests/examples")
   endif()
 
+  option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF)
+  if(EIGEN_TEST_AVX512DQ)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq")
+    if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
+    endif()
+    message(STATUS "Enabling AVX512DQ in tests/examples")
+  endif()
+
   option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
   if(EIGEN_TEST_F16C)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
@@ -275,6 +286,12 @@ if(NOT MSVC)
     message(STATUS "Enabling VSX in tests/examples")
   endif()
 
+  option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
+  if(EIGEN_TEST_MSA)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
+    message(STATUS "Enabling MSA in tests/examples")
+  endif()
+
   option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
   if(EIGEN_TEST_NEON)
     if(EIGEN_TEST_FMA)
@@ -292,12 +309,18 @@ if(NOT MSVC)
     message(STATUS "Enabling NEON in tests/examples")
   endif()
 
-  option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
-  if(EIGEN_TEST_ZVECTOR)
+  option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
+  if(EIGEN_TEST_Z13)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
     message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
   endif()
 
+  option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF)
+  if(EIGEN_TEST_Z14)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
+    message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
+  endif()
+
   check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
   if(COMPILER_SUPPORT_OPENMP)
     option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
@@ -307,7 +330,7 @@ if(NOT MSVC)
     endif()
   endif()
 
-else(NOT MSVC)
+else()
 
   # C4127 - conditional expression is constant
   # C4714 - marked as __forceinline not inlined (I failed to deactivate it selectively)
@@ -315,7 +338,7 @@ else(NOT MSVC)
   #         because we are oftentimes returning objects that have a destructor or may
   #         throw exceptions - in particular in the unit tests we are throwing extra many
   #         exceptions to cover indexing errors.
-  # C4505 - unreferenced local function has been removed (impossible to deactive selectively)
+  # C4505 - unreferenced local function has been removed (impossible to deactivate selectively)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /wd4127 /wd4505 /wd4714")
 
   # replace all /Wx by /W4
@@ -335,10 +358,23 @@ else(NOT MSVC)
     if(NOT CMAKE_CL_64)
       # arch is not supported on 64 bit systems, SSE is enabled automatically.
       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2")
-    endif(NOT CMAKE_CL_64)
+    endif()
     message(STATUS "Enabling SSE2 in tests/examples")
-  endif(EIGEN_TEST_SSE2)
-endif(NOT MSVC)
+  endif()
+
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF)
+  if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+    message(STATUS "Enabling FMA/AVX2 in tests/examples")
+  endif()
+
+endif()
 
 option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF)
 option(EIGEN_TEST_X87 "Force using X87 instructions. Implies no vectorization." OFF)
@@ -382,7 +418,7 @@ endif()
 
 set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
 
-include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 # Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
 if(EIGEN_INCLUDE_INSTALL_DIR)
@@ -391,25 +427,26 @@ endif()
 
 if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
   set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
-      CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
+      CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed")
 else()
   set(INCLUDE_INSTALL_DIR
       "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
-      CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
+      CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed"
       )
 endif()
 set(CMAKEPACKAGE_INSTALL_DIR
     "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
-    CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
+    CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen3Config.cmake is installed"
     )
 set(PKGCONFIG_INSTALL_DIR
     "${CMAKE_INSTALL_DATADIR}/pkgconfig"
-    CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
+    CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where eigen3.pc is installed"
     )
 
 foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR)
+  # If an absolute path is specified, make it relative to "{CMAKE_INSTALL_PREFIX}".
   if(IS_ABSOLUTE "${${var}}")
-    message(FATAL_ERROR "${var} must be relative to CMAKE_PREFIX_PATH. Got: ${${var}}")
+    file(RELATIVE_PATH "${var}" "${CMAKE_INSTALL_PREFIX}" "${${var}}")
   endif()
 endforeach()
 
@@ -420,9 +457,9 @@ macro(ei_add_target_property target prop value)
   # if the property wasn't previously set, ${previous} is now "previous-NOTFOUND" which cmake allows catching with plain if()
   if(NOT previous)
     set(previous "")
-  endif(NOT previous)
+  endif()
   set_target_properties(${target} PROPERTIES ${prop} "${previous} ${value}")
-endmacro(ei_add_target_property)
+endmacro()
 
 install(FILES
   signature_of_eigen3_matrix_library
@@ -436,9 +473,14 @@ if(EIGEN_BUILD_PKGCONFIG)
         )
 endif()
 
-add_subdirectory(Eigen)
+install(DIRECTORY Eigen DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel)
+
+
+option(EIGEN_BUILD_DOC "Enable creation of Eigen documentation" ON)
+if(EIGEN_BUILD_DOC)
+  add_subdirectory(doc EXCLUDE_FROM_ALL)
+endif()
 
-add_subdirectory(doc EXCLUDE_FROM_ALL)
 
 option(BUILD_TESTING "Enable creation of Eigen tests." ON)
 if(BUILD_TESTING)
@@ -449,6 +491,8 @@ if(BUILD_TESTING)
   else()
     add_subdirectory(test EXCLUDE_FROM_ALL)
   endif()
+
+  add_subdirectory(failtest)
 endif()
 
 if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
@@ -461,9 +505,32 @@ endif()
 
 # add SYCL
 option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
+option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
 if(EIGEN_TEST_SYCL)
   set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
-  include(FindComputeCpp)
+  find_package(Threads REQUIRED)
+  if(EIGEN_SYCL_TRISYCL)
+    message(STATUS "Using triSYCL")
+    include(FindTriSYCL)
+  else()
+    message(STATUS "Using ComputeCPP SYCL")
+    include(FindComputeCpp)
+    set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
+    if (NOT MSVC)
+      set(COMPUTECPP_DRIVER_DEFAULT_VALUE ON)
+    endif()
+    option(COMPUTECPP_USE_COMPILER_DRIVER
+      "Use ComputeCpp driver instead of a 2 steps compilation"
+      ${COMPUTECPP_DRIVER_DEFAULT_VALUE}
+    )
+  endif(EIGEN_SYCL_TRISYCL)
+  option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
+  if(EIGEN_DONT_VECTORIZE_SYCL)
+    message(STATUS "Disabling SYCL vectorization in tests/examples")
+    # When disabling SYCL vectorization, also disable Eigen default vectorization
+    add_definitions(-DEIGEN_DONT_VECTORIZE=1)
+    add_definitions(-DEIGEN_DONT_VECTORIZE_SYCL=1)
+  endif()
 endif()
 
 add_subdirectory(unsupported)
@@ -476,11 +543,11 @@ add_subdirectory(scripts EXCLUDE_FROM_ALL)
 # TODO: consider also replacing EIGEN_BUILD_BTL by a custom target "make btl"?
 if(EIGEN_BUILD_BTL)
   add_subdirectory(bench/btl EXCLUDE_FROM_ALL)
-endif(EIGEN_BUILD_BTL)
+endif()
 
 if(NOT WIN32)
   add_subdirectory(bench/spbench EXCLUDE_FROM_ALL)
-endif(NOT WIN32)
+endif()
 
 configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY)
 
@@ -492,37 +559,32 @@ message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
 message(STATUS "")
 
-option(EIGEN_FAILTEST "Enable failtests." OFF)
-if(EIGEN_FAILTEST)
-  add_subdirectory(failtest)
-endif()
-
 string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower)
 if(cmake_generator_tolower MATCHES "makefile")
-  message(STATUS "Some things you can do now:")
-  message(STATUS "--------------+--------------------------------------------------------------")
-  message(STATUS "Command       |   Description")
-  message(STATUS "--------------+--------------------------------------------------------------")
-  message(STATUS "make install  | Install Eigen. Headers will be installed to:")
-  message(STATUS "              |     <CMAKE_INSTALL_PREFIX>/<INCLUDE_INSTALL_DIR>")
-  message(STATUS "              |   Using the following values:")
-  message(STATUS "              |     CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
-  message(STATUS "              |     INCLUDE_INSTALL_DIR:  ${INCLUDE_INSTALL_DIR}")
-  message(STATUS "              |   Change the install location of Eigen headers using:")
-  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourprefix")
-  message(STATUS "              |   Or:")
-  message(STATUS "              |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
-  message(STATUS "make doc      | Generate the API documentation, requires Doxygen & LaTeX")
-  message(STATUS "make check    | Build and run the unit-tests. Read this page:")
-  message(STATUS "              |   http://eigen.tuxfamily.org/index.php?title=Tests")
-  message(STATUS "make blas     | Build BLAS library (not the same thing as Eigen)")
-  message(STATUS "make uninstall| Removes files installed by make install")
-  message(STATUS "--------------+--------------------------------------------------------------")
+  message(STATUS "Available targets (use: make TARGET):")
 else()
-  message(STATUS "To build/run the unit tests, read this page:")
-  message(STATUS "  http://eigen.tuxfamily.org/index.php?title=Tests")
+  message(STATUS "Available targets (use: cmake --build . --target TARGET):")
 endif()
-
+message(STATUS "---------+--------------------------------------------------------------")
+message(STATUS "Target   |   Description")
+message(STATUS "---------+--------------------------------------------------------------")
+message(STATUS "install  | Install Eigen. Headers will be installed to:")
+message(STATUS "         |     <CMAKE_INSTALL_PREFIX>/<INCLUDE_INSTALL_DIR>")
+message(STATUS "         |   Using the following values:")
+message(STATUS "         |     CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+message(STATUS "         |     INCLUDE_INSTALL_DIR:  ${INCLUDE_INSTALL_DIR}")
+message(STATUS "         |   Change the install location of Eigen headers using:")
+message(STATUS "         |     cmake . -DCMAKE_INSTALL_PREFIX=yourprefix")
+message(STATUS "         |   Or:")
+message(STATUS "         |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
+message(STATUS "doc      | Generate the API documentation, requires Doxygen & LaTeX")
+if(BUILD_TESTING)
+  message(STATUS "check    | Build and run the unit-tests. Read this page:")
+  message(STATUS "         |   http://eigen.tuxfamily.org/index.php?title=Tests")
+endif()
+message(STATUS "blas     | Build BLAS library (not the same thing as Eigen)")
+message(STATUS "uninstall| Remove files installed by the install target")
+message(STATUS "---------+--------------------------------------------------------------")
 message(STATUS "")
 
 
@@ -534,82 +596,48 @@ set ( EIGEN_DEFINITIONS "")
 set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
 
-# Interface libraries require at least CMake 3.0
-if (NOT CMAKE_VERSION VERSION_LESS 3.0)
-  include (CMakePackageConfigHelpers)
-
-  # Imported target support
-  add_library (eigen INTERFACE)
-
-  target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
-  target_include_directories (eigen INTERFACE
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
-    $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
-  )
-
-  # Export as title case Eigen
-  set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
-
-  install (TARGETS eigen EXPORT Eigen3Targets)
-
-  configure_package_config_file (
-    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
-    ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-    PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
-    INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
-    NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
-  )
-  # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
-  # not depend on architecture specific settings or libraries. More
-  # specifically, an Eigen3Config.cmake generated from a 64 bit target can be
-  # used for 32 bit targets as well (and vice versa).
-  set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
-  unset (CMAKE_SIZEOF_VOID_P)
-  write_basic_package_version_file (Eigen3ConfigVersion.cmake
-                                    VERSION ${EIGEN_VERSION_NUMBER}
-                                    COMPATIBILITY SameMajorVersion)
-  set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
-
-  # The Eigen target will be located in the Eigen3 namespace. Other CMake
-  # targets can refer to it using Eigen3::Eigen.
-  export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
-  # Export Eigen3 package to CMake registry such that it can be easily found by
-  # CMake even if it has not been installed to a standard directory.
-  export (PACKAGE Eigen3)
-
-  install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
-
-else (NOT CMAKE_VERSION VERSION_LESS 3.0)
-  # Fallback to legacy Eigen3Config.cmake without the imported target
-  
-  # If CMakePackageConfigHelpers module is available (CMake >= 2.8.8)
-  # create a relocatable Config file, otherwise leave the hardcoded paths       
-  include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH)
-  
-  if(CPCH_PATH)
-    configure_package_config_file (
-      ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
-      ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-      PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
-      INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
-      NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
-    )
-  else() 
-    # The PACKAGE_* variables are defined by the configure_package_config_file
-    # but without it we define them manually to the hardcoded paths
-    set(PACKAGE_INIT "")
-    set(PACKAGE_EIGEN_INCLUDE_DIR ${EIGEN_INCLUDE_DIR})
-    set(PACKAGE_EIGEN_ROOT_DIR ${EIGEN_ROOT_DIR})
-    configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in
-                     ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-                     @ONLY ESCAPE_QUOTES )
-  endif()
-
-  write_basic_package_version_file( Eigen3ConfigVersion.cmake
-                                    VERSION ${EIGEN_VERSION_NUMBER}
-                                    COMPATIBILITY SameMajorVersion )
-
-endif (NOT CMAKE_VERSION VERSION_LESS 3.0)
+include (CMakePackageConfigHelpers)
+
+# Imported target support
+add_library (eigen INTERFACE)
+add_library (Eigen3::Eigen ALIAS eigen)
+target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
+target_include_directories (eigen INTERFACE
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<INSTALL_INTERFACE:${INCLUDE_INSTALL_DIR}>
+)
+
+# Export as title case Eigen
+set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen)
+
+install (TARGETS eigen EXPORT Eigen3Targets)
+
+configure_package_config_file (
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
+  PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR
+  INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components
+)
+# Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does
+# not depend on architecture specific settings or libraries. More
+# specifically, an Eigen3Config.cmake generated from a 64 bit target can be
+# used for 32 bit targets as well (and vice versa).
+set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+unset (CMAKE_SIZEOF_VOID_P)
+write_basic_package_version_file (Eigen3ConfigVersion.cmake
+                                  VERSION ${EIGEN_VERSION_NUMBER}
+                                  COMPATIBILITY SameMajorVersion)
+set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P})
+
+# The Eigen target will be located in the Eigen3 namespace. Other CMake
+# targets can refer to it using Eigen3::Eigen.
+export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake)
+# Export Eigen3 package to CMake registry such that it can be easily found by
+# CMake even if it has not been installed to a standard directory.
+export (PACKAGE Eigen3)
+
+install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR})
 
 install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
                 ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
@@ -619,3 +647,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
 # Add uninstall target
 add_custom_target ( uninstall
     COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
+
+if (EIGEN_SPLIT_TESTSUITE)
+  ei_split_testsuite("${EIGEN_SPLIT_TESTSUITE}")
+endif()
diff --git a/contrib/eigen/COPYING.APACHE b/contrib/eigen/COPYING.APACHE
new file mode 100644
index 0000000000000000000000000000000000000000..61e948d2aaa3f7cd1db246b874d3b8ad1e8204fb
--- /dev/null
+++ b/contrib/eigen/COPYING.APACHE
@@ -0,0 +1,203 @@
+/*
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
\ No newline at end of file
diff --git a/contrib/eigen/COPYING.BSD b/contrib/eigen/COPYING.BSD
index 11971ffe25ba29dd65800b77864490ee6df251c2..8964ddfdd0effc65e8dd90ea1d8c411efb30e5fd 100644
--- a/contrib/eigen/COPYING.BSD
+++ b/contrib/eigen/COPYING.BSD
@@ -23,4 +23,4 @@
  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
\ No newline at end of file
+*/
diff --git a/contrib/eigen/COPYING.MINPACK b/contrib/eigen/COPYING.MINPACK
index 11d8a9a6c340f8ae340b3f009de8509fcea775fd..132cc3f33fa7fd4169a92b05241db59c8428a7ab 100644
--- a/contrib/eigen/COPYING.MINPACK
+++ b/contrib/eigen/COPYING.MINPACK
@@ -1,52 +1,51 @@
-Minpack Copyright Notice (1999) University of Chicago.  All rights reserved
-
-Redistribution and use in source and binary forms, with or
-without modification, are permitted provided that the
-following conditions are met:
-
-1. Redistributions of source code must retain the above
-copyright notice, this list of conditions and the following
-disclaimer.
-
-2. Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials
-provided with the distribution.
-
-3. The end-user documentation included with the
-redistribution, if any, must include the following
-acknowledgment:
-
-   "This product includes software developed by the
-   University of Chicago, as Operator of Argonne National
-   Laboratory.
-
-Alternately, this acknowledgment may appear in the software
-itself, if and wherever such third-party acknowledgments
-normally appear.
-
-4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
-WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
-UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
-THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
-OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
-OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
-USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
-THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
-DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
-UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
-BE CORRECTED.
-
-5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
-HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
-ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
-INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
-ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
-PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
-SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
-(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
-EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
-POSSIBILITY OF SUCH LOSS OR DAMAGES.
-
+Minpack Copyright Notice (1999) University of Chicago.  All rights reserved
+
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the
+following conditions are met:
+
+1. Redistributions of source code must retain the above
+copyright notice, this list of conditions and the following
+disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials
+provided with the distribution.
+
+3. The end-user documentation included with the
+redistribution, if any, must include the following
+acknowledgment:
+
+   "This product includes software developed by the
+   University of Chicago, as Operator of Argonne National
+   Laboratory.
+
+Alternately, this acknowledgment may appear in the software
+itself, if and wherever such third-party acknowledgments
+normally appear.
+
+4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
+WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
+UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
+THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
+OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
+OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
+USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
+THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
+DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
+UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
+BE CORRECTED.
+
+5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
+HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
+ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
+INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
+ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
+PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
+SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
+(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
+EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
+POSSIBILITY OF SUCH LOSS OR DAMAGES.
diff --git a/contrib/eigen/CTestConfig.cmake b/contrib/eigen/CTestConfig.cmake
index 45de0e6fcaf0ef57fed52006e1192eba4ce9dddb..0ea24b8e3447872d8bbb08e574678af7ef4f937f 100644
--- a/contrib/eigen/CTestConfig.cmake
+++ b/contrib/eigen/CTestConfig.cmake
@@ -2,8 +2,8 @@
 ## Then modify the CMakeLists.txt file in the root directory of your
 ## project to incorporate the testing dashboard.
 ## # The following are required to uses Dart and the Cdash dashboard
-##   ENABLE_TESTING()
-##   INCLUDE(CTest)
+##   enable_testing()
+##   include(CTest)
 set(CTEST_PROJECT_NAME "Eigen")
 set(CTEST_NIGHTLY_START_TIME "00:00:00 UTC")
 
@@ -11,3 +11,7 @@ set(CTEST_DROP_METHOD "http")
 set(CTEST_DROP_SITE "my.cdash.org")
 set(CTEST_DROP_LOCATION "/submit.php?project=Eigen")
 set(CTEST_DROP_SITE_CDASH TRUE)
+#set(CTEST_PROJECT_SUBPROJECTS
+#Official
+#Unsupported
+#)
diff --git a/contrib/eigen/Eigen/CMakeLists.txt b/contrib/eigen/Eigen/CMakeLists.txt
deleted file mode 100644
index 9eb502b792d4a9607e2d323f2d8114939b43c209..0000000000000000000000000000000000000000
--- a/contrib/eigen/Eigen/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-include(RegexUtils)
-test_escape_string_as_regex()
-
-file(GLOB Eigen_directory_files "*")
-
-escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
-
-foreach(f ${Eigen_directory_files})
-  if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/src")
-    list(APPEND Eigen_directory_files_to_install ${f})
-  endif()
-endforeach(f ${Eigen_directory_files})
-
-install(FILES
-  ${Eigen_directory_files_to_install}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
-  )
-
-install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/contrib/eigen/Eigen/Cholesky b/contrib/eigen/Eigen/Cholesky
index 1332b540d838247e671ac3e256512501ac985846..a318ceb797dbaa10b2e1185583eb4803d7a83f8f 100644
--- a/contrib/eigen/Eigen/Cholesky
+++ b/contrib/eigen/Eigen/Cholesky
@@ -43,4 +43,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CHOLESKY_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/Eigen/Core b/contrib/eigen/Eigen/Core
index ac7c5b3004f5fc198482419f739cca30130a0c4a..5921e15f9df46319ceb5436296b3271ca4dc9e65 100644
--- a/contrib/eigen/Eigen/Core
+++ b/contrib/eigen/Eigen/Core
@@ -11,251 +11,55 @@
 #ifndef EIGEN_CORE_H
 #define EIGEN_CORE_H
 
-// first thing Eigen does: stop the compiler from committing suicide
+// first thing Eigen does: stop the compiler from reporting useless warnings.
 #include "src/Core/util/DisableStupidWarnings.h"
 
-#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
-  #define EIGEN_CUDACC __CUDACC__
-#endif
-
-#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
-  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
-#endif
-
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
-#define EIGEN_CUDACC_VER  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_CUDACC_VER __CUDACC_VER__
-#else
-#define EIGEN_CUDACC_VER 0
-#endif
-
-// Handle NVCC/CUDA/SYCL
-#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
-  // Do not try asserts on CUDA and SYCL!
-  #ifndef EIGEN_NO_DEBUG
-  #define EIGEN_NO_DEBUG
-  #endif
-
-  #ifdef EIGEN_INTERNAL_DEBUGGING
-  #undef EIGEN_INTERNAL_DEBUGGING
-  #endif
-
-  #ifdef EIGEN_EXCEPTIONS
-  #undef EIGEN_EXCEPTIONS
-  #endif
-
-  // All functions callable from CUDA code must be qualified with __device__
-  #ifdef __CUDACC__
-    // Do not try to vectorize on CUDA and SYCL!
-    #ifndef EIGEN_DONT_VECTORIZE
-    #define EIGEN_DONT_VECTORIZE
-    #endif
-
-    #define EIGEN_DEVICE_FUNC __host__ __device__
-    // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
-    // works properly on the device side
-    #include <cuda_runtime.h>
-  #else
-    #define EIGEN_DEVICE_FUNC
-  #endif
-
-#else
-  #define EIGEN_DEVICE_FUNC
+// then include this file where all our macros are defined. It's really important to do it first because
+// it's where we do all the compiler/OS/arch detections and define most defaults.
+#include "src/Core/util/Macros.h"
 
-#endif
+// This detects SSE/AVX/NEON/etc. and configure alignment settings
+#include "src/Core/util/ConfigureVectorization.h"
 
-// When compiling CUDA device code with NVCC, pull in math functions from the
-// global namespace.  In host mode, and when device doee with clang, use the
-// std versions.
-#if defined(__CUDA_ARCH__) && defined(__NVCC__)
-  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
-#else
-  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+// We need cuda_runtime.h/hip_runtime.h to ensure that
+// the EIGEN_USING_STD macro works properly on the device side
+#if defined(EIGEN_CUDACC)
+  #include <cuda_runtime.h>
+#elif defined(EIGEN_HIPCC)
+  #include <hip/hip_runtime.h>
 #endif
 
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
-  #define EIGEN_EXCEPTIONS
-#endif
 
 #ifdef EIGEN_EXCEPTIONS
   #include <new>
 #endif
 
-// then include this file where all our macros are defined. It's really important to do it first because
-// it's where we do all the alignment settings (platform detection and honoring the user's will if he
-// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
-#include "src/Core/util/Macros.h"
-
 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) && EIGEN_GNUC_AT_MOST(5,5)
   #pragma GCC optimize ("-fno-ipa-cp-clone")
 #endif
 
+// Prevent ICC from specializing std::complex operators that silently fail
+// on device. This allows us to use our own device-compatible specializations
+// instead.
+#if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \
+    && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_)
+#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
+#endif
 #include <complex>
 
 // this include file manages BLAS and MKL related macros
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"
 
-// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
-// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
-#if EIGEN_MAX_ALIGN_BYTES==0
-  #ifndef EIGEN_DONT_VECTORIZE
-    #define EIGEN_DONT_VECTORIZE
-  #endif
-#endif
 
-#if EIGEN_COMP_MSVC
-  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
-  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
-    // Remember that usage of defined() in a #define is undefined by the standard.
-    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
-    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
-      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
-    #endif
-  #endif
-#else
-  // Remember that usage of defined() in a #define is undefined by the standard
-  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
-    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
-  #endif
-#endif
-
-#ifndef EIGEN_DONT_VECTORIZE
-
-  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
-
-    // Defines symbols for compile-time detection of which instructions are
-    // used.
-    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_SSE
-    #define EIGEN_VECTORIZE_SSE2
-
-    // Detect sse3/ssse3/sse4:
-    // gcc and icc defines __SSE3__, ...
-    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
-    // want to force the use of those instructions with msvc.
-    #ifdef __SSE3__
-      #define EIGEN_VECTORIZE_SSE3
-    #endif
-    #ifdef __SSSE3__
-      #define EIGEN_VECTORIZE_SSSE3
-    #endif
-    #ifdef __SSE4_1__
-      #define EIGEN_VECTORIZE_SSE4_1
-    #endif
-    #ifdef __SSE4_2__
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #ifdef __AVX__
-      #define EIGEN_VECTORIZE_AVX
-      #define EIGEN_VECTORIZE_SSE3
-      #define EIGEN_VECTORIZE_SSSE3
-      #define EIGEN_VECTORIZE_SSE4_1
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #ifdef __AVX2__
-      #define EIGEN_VECTORIZE_AVX2
-    #endif
-    #ifdef __FMA__
-      #define EIGEN_VECTORIZE_FMA
-    #endif
-    #if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512)
-      #define EIGEN_VECTORIZE_AVX512
-      #define EIGEN_VECTORIZE_AVX2
-      #define EIGEN_VECTORIZE_AVX
-      #define EIGEN_VECTORIZE_FMA
-      #ifdef __AVX512DQ__
-        #define EIGEN_VECTORIZE_AVX512DQ
-      #endif
-      #ifdef __AVX512ER__
-        #define EIGEN_VECTORIZE_AVX512ER
-      #endif
-    #endif
-
-    // include files
-
-    // This extern "C" works around a MINGW-w64 compilation issue
-    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
-    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
-    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
-    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
-    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
-    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
-    extern "C" {
-      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
-      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #if EIGEN_COMP_ICC >= 1110
-        #include <immintrin.h>
-      #else
-        #include <mmintrin.h>
-        #include <emmintrin.h>
-        #include <xmmintrin.h>
-        #ifdef  EIGEN_VECTORIZE_SSE3
-        #include <pmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSSE3
-        #include <tmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_1
-        #include <smmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_2
-        #include <nmmintrin.h>
-        #endif
-        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
-        #include <immintrin.h>
-        #endif
-      #endif
-    } // end extern "C"
-  #elif defined __VSX__
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_VSX
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-  #elif defined __ALTIVEC__
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_ALTIVEC
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_NEON
-    #include <arm_neon.h>
-  #elif (defined __s390x__ && defined __VEC__)
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_ZVECTOR
-    #include <vecintrin.h>
-  #endif
-#endif
-
-#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
-  // We can use the optimized fp16 to float and float to fp16 conversion routines
-  #define EIGEN_HAS_FP16_C
-#endif
-
-#if defined __CUDACC__
-  #define EIGEN_VECTORIZE_CUDA
-  #include <vector_types.h>
-  #if EIGEN_CUDACC_VER >= 70500
-    #define EIGEN_HAS_CUDA_FP16
-  #endif
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+  #define EIGEN_HAS_GPU_FP16
 #endif
 
-#if defined EIGEN_HAS_CUDA_FP16
-  #include <host_defines.h>
-  #include <cuda_fp16.h>
+#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
+  #define EIGEN_HAS_GPU_BF16
 #endif
 
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@@ -290,6 +94,10 @@
 // for min/max:
 #include <algorithm>
 
+#if EIGEN_HAS_CXX11
+#include <array>
+#endif
+
 // for std::is_nothrow_move_assignable
 #ifdef EIGEN_INCLUDE_TYPE_TRAITS
 #include <type_traits>
@@ -305,38 +113,25 @@
   #include <intrin.h>
 #endif
 
-/** \brief Namespace containing all symbols from the %Eigen library. */
-namespace Eigen {
-
-inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_AVX512)
-  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_AVX)
-  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_2)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_1)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
-#elif defined(EIGEN_VECTORIZE_SSSE3)
-  return "SSE, SSE2, SSE3, SSSE3";
-#elif defined(EIGEN_VECTORIZE_SSE3)
-  return "SSE, SSE2, SSE3";
-#elif defined(EIGEN_VECTORIZE_SSE2)
-  return "SSE, SSE2";
-#elif defined(EIGEN_VECTORIZE_ALTIVEC)
-  return "AltiVec";
-#elif defined(EIGEN_VECTORIZE_VSX)
-  return "VSX";
-#elif defined(EIGEN_VECTORIZE_NEON)
-  return "ARM NEON";
-#elif defined(EIGEN_VECTORIZE_ZVECTOR)
-  return "S390X ZVECTOR";
-#else
-  return "None";
+#if defined(EIGEN_USE_SYCL)
+  #undef min
+  #undef max
+  #undef isnan
+  #undef isinf
+  #undef isfinite
+  #include <CL/sycl.hpp>
+  #include <map>
+  #include <memory>
+  #include <utility>
+  #include <thread>
+  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
+  #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
+  #endif
+  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1
+  #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16
+  #endif
 #endif
-}
 
-} // end namespace Eigen
 
 #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
 // This will generate an error message:
@@ -345,7 +140,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
 
 namespace Eigen {
 
-// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
 // ensure QNX/QCC support
 using std::size_t;
 // gcc 4.6.0 wants std:: for ptrdiff_t
@@ -369,60 +164,90 @@ using std::ptrdiff_t;
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
 #include "src/Core/util/Memory.h"
+#include "src/Core/util/IntegralConstant.h"
+#include "src/Core/util/SymbolicIndex.h"
 
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 #include "src/Core/MathFunctionsImpl.h"
 #include "src/Core/arch/Default/ConjHelper.h"
+// Generic half float support
+#include "src/Core/arch/Default/Half.h"
+#include "src/Core/arch/Default/BFloat16.h"
+#include "src/Core/arch/Default/TypeCasting.h"
+#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
 
 #if defined EIGEN_VECTORIZE_AVX512
   #include "src/Core/arch/SSE/PacketMath.h"
-  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
+  #include "src/Core/arch/SSE/Complex.h"
   #include "src/Core/arch/AVX/PacketMath.h"
-  #include "src/Core/arch/AVX/MathFunctions.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/AVX/Complex.h"
   #include "src/Core/arch/AVX512/PacketMath.h"
+  #include "src/Core/arch/AVX512/TypeCasting.h"
+  #include "src/Core/arch/AVX512/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
   #include "src/Core/arch/AVX512/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_AVX
   // Use AVX for floats and doubles, SSE for integers
   #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
   #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/AVX/PacketMath.h"
-  #include "src/Core/arch/AVX/MathFunctions.h"
-  #include "src/Core/arch/AVX/Complex.h"
   #include "src/Core/arch/AVX/TypeCasting.h"
-  #include "src/Core/arch/SSE/TypeCasting.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
   #include "src/Core/arch/AltiVec/PacketMath.h"
   #include "src/Core/arch/AltiVec/MathFunctions.h"
   #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
   #include "src/Core/arch/NEON/PacketMath.h"
+  #include "src/Core/arch/NEON/TypeCasting.h"
   #include "src/Core/arch/NEON/MathFunctions.h"
   #include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_SVE
+  #include "src/Core/arch/SVE/PacketMath.h"
+  #include "src/Core/arch/SVE/TypeCasting.h"
+  #include "src/Core/arch/SVE/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_ZVECTOR
   #include "src/Core/arch/ZVector/PacketMath.h"
   #include "src/Core/arch/ZVector/MathFunctions.h"
   #include "src/Core/arch/ZVector/Complex.h"
+#elif defined EIGEN_VECTORIZE_MSA
+  #include "src/Core/arch/MSA/PacketMath.h"
+  #include "src/Core/arch/MSA/MathFunctions.h"
+  #include "src/Core/arch/MSA/Complex.h"
 #endif
 
-// Half float support
-#include "src/Core/arch/CUDA/Half.h"
-#include "src/Core/arch/CUDA/PacketMathHalf.h"
-#include "src/Core/arch/CUDA/TypeCasting.h"
+#if defined EIGEN_VECTORIZE_GPU
+  #include "src/Core/arch/GPU/PacketMath.h"
+  #include "src/Core/arch/GPU/MathFunctions.h"
+  #include "src/Core/arch/GPU/TypeCasting.h"
+#endif
 
-#if defined EIGEN_VECTORIZE_CUDA
-  #include "src/Core/arch/CUDA/PacketMath.h"
-  #include "src/Core/arch/CUDA/MathFunctions.h"
+#if defined(EIGEN_USE_SYCL)
+  #include "src/Core/arch/SYCL/SyclMemoryModel.h"
+  #include "src/Core/arch/SYCL/InteropHeaders.h"
+#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
+  #include "src/Core/arch/SYCL/PacketMath.h"
+  #include "src/Core/arch/SYCL/MathFunctions.h"
+  #include "src/Core/arch/SYCL/TypeCasting.h"
+#endif
 #endif
 
 #include "src/Core/arch/Default/Settings.h"
+// This file provides generic implementations valid for scalar as well
+#include "src/Core/arch/Default/GenericPacketMathFunctions.h"
 
 #include "src/Core/functors/TernaryFunctors.h"
 #include "src/Core/functors/BinaryFunctors.h"
@@ -433,9 +258,16 @@ using std::ptrdiff_t;
 
 // Specialized functors to enable the processing of complex numbers
 // on CUDA devices
+#ifdef EIGEN_CUDACC
 #include "src/Core/arch/CUDA/Complex.h"
+#endif
 
-#include "src/Core/IO.h"
+#include "src/Core/util/IndexedViewHelper.h"
+#include "src/Core/util/ReshapedHelper.h"
+#include "src/Core/ArithmeticSequence.h"
+#ifndef EIGEN_NO_IO
+  #include "src/Core/IO.h"
+#endif
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -476,6 +308,8 @@ using std::ptrdiff_t;
 #include "src/Core/Ref.h"
 #include "src/Core/Block.h"
 #include "src/Core/VectorBlock.h"
+#include "src/Core/IndexedView.h"
+#include "src/Core/Reshaped.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
@@ -512,13 +346,21 @@ using std::ptrdiff_t;
 #include "src/Core/CoreIterators.h"
 #include "src/Core/ConditionEstimator.h"
 
+#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+  #include "src/Core/arch/AltiVec/MatrixProduct.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
+#endif
+
 #include "src/Core/BooleanRedux.h"
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
+#include "src/Core/PartialReduxEvaluator.h"
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
 #include "src/Core/ArrayWrapper.h"
+#include "src/Core/StlIterators.h"
 
 #ifdef EIGEN_USE_BLAS
 #include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
diff --git a/contrib/eigen/Eigen/Eigenvalues b/contrib/eigen/Eigen/Eigenvalues
index 7d6ac787bed280c2f469526e5f0adb8c6ef71931..5467a2e7b3ecf40c3e544bb29717a66c83f7eaa1 100644
--- a/contrib/eigen/Eigen/Eigenvalues
+++ b/contrib/eigen/Eigen/Eigenvalues
@@ -58,4 +58,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_EIGENVALUES_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/Eigen/Geometry b/contrib/eigen/Eigen/Geometry
index da88c03bbf5e678875dd5e4e94d217de985bb9a1..bc78110a846693fa45caa0780cf29d7285695f08 100644
--- a/contrib/eigen/Eigen/Geometry
+++ b/contrib/eigen/Eigen/Geometry
@@ -49,14 +49,11 @@
 #include "src/Geometry/AlignedBox.h"
 #include "src/Geometry/Umeyama.h"
 
-// Use the SSE optimized version whenever possible. At the moment the
-// SSE version doesn't compile when AVX is enabled
-#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
-#include "src/Geometry/arch/Geometry_SSE.h"
+// Use the SSE optimized version whenever possible.
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
+#include "src/Geometry/arch/Geometry_SIMD.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_GEOMETRY_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
-
diff --git a/contrib/eigen/Eigen/Householder b/contrib/eigen/Eigen/Householder
index 89cd81b1afbcff1d01d37585174c3136598a90bb..f2fa79969c96339bd317d157a37c72ec71b4024b 100644
--- a/contrib/eigen/Eigen/Householder
+++ b/contrib/eigen/Eigen/Householder
@@ -27,4 +27,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_HOUSEHOLDER_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/Eigen/Jacobi b/contrib/eigen/Eigen/Jacobi
index 17c1d785a16280bb93e8d87e50bd62c0ea6c340d..43edc7a1946edaa8c30c8715c50ebcf8d817fa3a 100644
--- a/contrib/eigen/Eigen/Jacobi
+++ b/contrib/eigen/Eigen/Jacobi
@@ -29,5 +29,4 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_JACOBI_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
 
diff --git a/contrib/eigen/Eigen/KLUSupport b/contrib/eigen/Eigen/KLUSupport
new file mode 100644
index 0000000000000000000000000000000000000000..b23d905351566cb425a803cba5868ea43338f9a6
--- /dev/null
+++ b/contrib/eigen/Eigen/KLUSupport
@@ -0,0 +1,41 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_MODULE_H
+#define EIGEN_KLUSUPPORT_MODULE_H
+
+#include <Eigen/SparseCore>
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+extern "C" {
+#include <btf.h>
+#include <klu.h>
+   }
+
+/** \ingroup Support_modules
+  * \defgroup KLUSupport_Module KLUSupport module
+  *
+  * This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
+  * It provides the following factorization class:
+  * - class KLU: a sparse LU factorization, well-suited for circuit simulation.
+  *
+  * \code
+  * #include <Eigen/KLUSupport>
+  * \endcode
+  *
+  * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
+  * The dependencies depend on how umfpack has been compiled.
+  * For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
+  *
+  */
+
+#include "src/KLUSupport/KLUSupport.h"
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_KLUSUPPORT_MODULE_H
diff --git a/contrib/eigen/Eigen/LU b/contrib/eigen/Eigen/LU
index 6418a86e1925bf1326e5099fe8babdf32be8a449..1236ceb04676f5e180b589f8ca70c3d9362710cf 100644
--- a/contrib/eigen/Eigen/LU
+++ b/contrib/eigen/Eigen/LU
@@ -38,13 +38,10 @@
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
 
-// Use the SSE optimized version whenever possible. At the moment the
-// SSE version doesn't compile when AVX is enabled
-#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
-  #include "src/LU/arch/Inverse_SSE.h"
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
+  #include "src/LU/arch/InverseSize4.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_LU_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/Eigen/OrderingMethods b/contrib/eigen/Eigen/OrderingMethods
index d8ea36193666f87c8cec6557f575a5b34654bef6..29691a62b44d4956a660a677233c7bce3fd05dce 100644
--- a/contrib/eigen/Eigen/OrderingMethods
+++ b/contrib/eigen/Eigen/OrderingMethods
@@ -63,10 +63,7 @@
   * \endcode
   */
 
-#ifndef EIGEN_MPL2_ONLY
 #include "src/OrderingMethods/Amd.h"
-#endif
-
 #include "src/OrderingMethods/Ordering.h"
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/contrib/eigen/Eigen/PaStiXSupport b/contrib/eigen/Eigen/PaStiXSupport
index de3a63b4d1264e6fa8c15582b702c13ddf8e195d..234619accee0e875a418eda983f870b25255c17e 100644
--- a/contrib/eigen/Eigen/PaStiXSupport
+++ b/contrib/eigen/Eigen/PaStiXSupport
@@ -36,6 +36,7 @@ extern "C" {
   * \endcode
   *
   * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
+  * This wrapper resuires PaStiX version 5.x compiled without MPI support.
   * The dependencies depend on how PaSTiX has been compiled.
   * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
   *
diff --git a/contrib/eigen/Eigen/PardisoSupport b/contrib/eigen/Eigen/PardisoSupport
old mode 100755
new mode 100644
diff --git a/contrib/eigen/Eigen/QR b/contrib/eigen/Eigen/QR
index 1be1863a1d568b221b7a15602d7a17793f56978c..8465b62ceee1aadf915660451142fa2d800db550 100644
--- a/contrib/eigen/Eigen/QR
+++ b/contrib/eigen/Eigen/QR
@@ -48,4 +48,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_QR_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/Eigen/QtAlignedMalloc b/contrib/eigen/Eigen/QtAlignedMalloc
index 4f07df02ae90fe15921ab6b6896c25c2dd530160..6fe82374a5ebdb0dbc17fa6e0119b1a399f126e3 100644
--- a/contrib/eigen/Eigen/QtAlignedMalloc
+++ b/contrib/eigen/Eigen/QtAlignedMalloc
@@ -37,4 +37,3 @@ void *qRealloc(void *ptr, std::size_t size)
 #endif
 
 #endif // EIGEN_QTMALLOC_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/Eigen/SVD b/contrib/eigen/Eigen/SVD
index 5d0e75f7f755c382ab8e6b5c606565f9ed43dd87..34517949632099aea944db49f84c4eb8f630ac5c 100644
--- a/contrib/eigen/Eigen/SVD
+++ b/contrib/eigen/Eigen/SVD
@@ -48,4 +48,3 @@
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SVD_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/Eigen/Sparse b/contrib/eigen/Eigen/Sparse
index 136e681a1f66cc70d61285ef7eaa64399162581f..a2ef7a66526812617c782f76dea260f04593ea58 100644
--- a/contrib/eigen/Eigen/Sparse
+++ b/contrib/eigen/Eigen/Sparse
@@ -25,9 +25,7 @@
 
 #include "SparseCore"
 #include "OrderingMethods"
-#ifndef EIGEN_MPL2_ONLY
 #include "SparseCholesky"
-#endif
 #include "SparseLU"
 #include "SparseQR"
 #include "IterativeLinearSolvers"
diff --git a/contrib/eigen/Eigen/SparseCholesky b/contrib/eigen/Eigen/SparseCholesky
index b6a320c4027b4a3c7f21b7b9f00bcf1e3aeb24ed..d2b1f1276da5192664ba0ba39bbb9a00c18dc225 100644
--- a/contrib/eigen/Eigen/SparseCholesky
+++ b/contrib/eigen/Eigen/SparseCholesky
@@ -30,16 +30,8 @@
   * \endcode
   */
 
-#ifdef EIGEN_MPL2_ONLY
-#error The SparseCholesky module has nothing to offer in MPL2 only mode
-#endif
-
 #include "src/SparseCholesky/SimplicialCholesky.h"
-
-#ifndef EIGEN_MPL2_ONLY
 #include "src/SparseCholesky/SimplicialCholesky_impl.h"
-#endif
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SPARSECHOLESKY_MODULE_H
diff --git a/contrib/eigen/Eigen/SparseLU b/contrib/eigen/Eigen/SparseLU
index 38b38b531d1c87cb4d5b8f1819687d8d4dd335eb..37c4a5c5a8b305add93e2aae0eb7eac624f3ce68 100644
--- a/contrib/eigen/Eigen/SparseLU
+++ b/contrib/eigen/Eigen/SparseLU
@@ -23,6 +23,8 @@
 // Ordering interface
 #include "OrderingMethods"
 
+#include "src/Core/util/DisableStupidWarnings.h"
+
 #include "src/SparseLU/SparseLU_gemm_kernel.h"
 
 #include "src/SparseLU/SparseLU_Structs.h"
@@ -43,4 +45,6 @@
 #include "src/SparseLU/SparseLU_Utils.h"
 #include "src/SparseLU/SparseLU.h"
 
+#include "src/Core/util/ReenableStupidWarnings.h"
+
 #endif // EIGEN_SPARSELU_MODULE_H
diff --git a/contrib/eigen/Eigen/src/Cholesky/LDLT.h b/contrib/eigen/Eigen/src/Cholesky/LDLT.h
index 15ccf24f1446c3d4fd33379c1c46dda03b1fd58a..1013ca045df62921d5c89054e13855649741ec86 100644
--- a/contrib/eigen/Eigen/src/Cholesky/LDLT.h
+++ b/contrib/eigen/Eigen/src/Cholesky/LDLT.h
@@ -16,6 +16,15 @@
 namespace Eigen {
 
 namespace internal {
+  template<typename _MatrixType, int _UpLo> struct traits<LDLT<_MatrixType, _UpLo> >
+   : traits<_MatrixType>
+  {
+    typedef MatrixXpr XprKind;
+    typedef SolverStorage StorageKind;
+    typedef int StorageIndex;
+    enum { Flags = 0 };
+  };
+
   template<typename MatrixType, int UpLo> struct LDLT_Traits;
 
   // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
@@ -36,7 +45,7 @@ namespace internal {
   * matrix \f$ A \f$ such that \f$ A =  P^TLDL^*P \f$, where P is a permutation matrix, L
   * is lower triangular with a unit diagonal and D is a diagonal matrix.
   *
-  * The decomposition uses pivoting to ensure stability, so that L will have
+  * The decomposition uses pivoting to ensure stability, so that D will have
   * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root
   * on D also stabilizes the computation.
   *
@@ -44,24 +53,23 @@ namespace internal {
   * decomposition to determine whether a system of equations has a solution.
   *
   * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
   * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
+        : public SolverBase<LDLT<_MatrixType, _UpLo> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<LDLT> Base;
+    friend class SolverBase<LDLT>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
       UpLo = _UpLo
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
 
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
@@ -180,6 +188,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
       return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
       *
       * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
@@ -191,19 +200,14 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
       * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
       * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
-      * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
+      * computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular.
       *
       * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
       */
     template<typename Rhs>
     inline const Solve<LDLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
-      return Solve<LDLT, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     template<typename Derived>
     bool solveInPlace(MatrixBase<Derived> &bAndX) const;
@@ -242,12 +246,12 @@ template<typename _MatrixType, int _UpLo> class LDLT
       */
     const LDLT& adjoint() const { return *this; };
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the factorization failed because of a zero pivot.
       */
     ComputationInfo info() const
@@ -258,8 +262,10 @@ template<typename _MatrixType, int _UpLo> class LDLT
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -560,14 +566,22 @@ template<typename _MatrixType, int _UpLo>
 template<typename RhsType, typename DstType>
 void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
+  _solve_impl_transposed<true>(rhs, dst);
+}
+
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
   // dst = P b
   dst = m_transpositions * rhs;
 
   // dst = L^-1 (P b)
-  matrixL().solveInPlace(dst);
+  // dst = L^-*T (P b)
+  matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
 
-  // dst = D^-1 (L^-1 P b)
+  // dst = D^-* (L^-1 P b)
+  // dst = D^-1 (L^-*T P b)
   // more precisely, use pseudo-inverse of D (see bug 241)
   using std::abs;
   const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
@@ -579,7 +593,6 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
   // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
   // Using numeric_limits::min() gives us more robustness to denormals.
   RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
-
   for (Index i = 0; i < vecD.size(); ++i)
   {
     if(abs(vecD(i)) > tolerance)
@@ -588,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
       dst.row(i).setZero();
   }
 
-  // dst = L^-T (D^-1 L^-1 P b)
-  matrixU().solveInPlace(dst);
+  // dst = L^-* (D^-* L^-1 P b)
+  // dst = L^-T (D^-1 L^-*T P b)
+  matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);
 
-  // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
+  // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b
+  // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b
   dst = m_transpositions.transpose() * dst;
 }
 #endif
diff --git a/contrib/eigen/Eigen/src/Cholesky/LLT.h b/contrib/eigen/Eigen/src/Cholesky/LLT.h
index e1624d21b690961b2603562d3c9cfc338750df87..8c9b2b398791d24f3d3abc6d5d0704c47fb3637a 100644
--- a/contrib/eigen/Eigen/src/Cholesky/LLT.h
+++ b/contrib/eigen/Eigen/src/Cholesky/LLT.h
@@ -13,6 +13,16 @@
 namespace Eigen {
 
 namespace internal{
+
+template<typename _MatrixType, int _UpLo> struct traits<LLT<_MatrixType, _UpLo> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
 template<typename MatrixType, int UpLo> struct LLT_Traits;
 }
 
@@ -54,18 +64,17 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
 template<typename _MatrixType, int _UpLo> class LLT
+        : public SolverBase<LLT<_MatrixType, _UpLo> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<LLT> Base;
+    friend class SolverBase<LLT>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(LLT)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename MatrixType::StorageIndex StorageIndex;
 
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -100,7 +109,7 @@ template<typename _MatrixType, int _UpLo> class LLT
       compute(matrix.derived());
     }
 
-    /** \brief Constructs a LDLT factorization from a given matrix
+    /** \brief Constructs a LLT factorization from a given matrix
       *
       * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
       * \c MatrixType is a Eigen::Ref.
@@ -129,6 +138,7 @@ template<typename _MatrixType, int _UpLo> class LLT
       return Traits::getL(m_matrix);
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
       *
       * Since this LLT class assumes anyway that the matrix A is invertible, the solution
@@ -141,13 +151,8 @@ template<typename _MatrixType, int _UpLo> class LLT
       */
     template<typename Rhs>
     inline const Solve<LLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LLT::solve(): invalid number of rows of the right hand side matrix b");
-      return Solve<LLT, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     template<typename Derived>
     void solveInPlace(const MatrixBase<Derived> &bAndX) const;
@@ -180,7 +185,7 @@ template<typename _MatrixType, int _UpLo> class LLT
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears not to be positive definite.
       */
     ComputationInfo info() const
@@ -194,18 +199,20 @@ template<typename _MatrixType, int _UpLo> class LLT
       * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
       * \code x = decomposition.adjoint().solve(b) \endcode
       */
-    const LLT& adjoint() const { return *this; };
+    const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; };
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     template<typename VectorType>
-    LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+    LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -459,7 +466,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
   */
 template<typename _MatrixType, int _UpLo>
 template<typename VectorType>
-LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
+LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
   eigen_assert(v.size()==m_matrix.cols());
@@ -477,8 +484,17 @@ template<typename _MatrixType,int _UpLo>
 template<typename RhsType, typename DstType>
 void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  dst = rhs;
-  solveInPlace(dst);
+  _solve_impl_transposed<true>(rhs, dst);
+}
+
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+    dst = rhs;
+
+    matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+    matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst);
 }
 #endif
 
diff --git a/contrib/eigen/Eigen/src/CholmodSupport/CholmodSupport.h b/contrib/eigen/Eigen/src/CholmodSupport/CholmodSupport.h
index 57197202383a6dfb1be48607f84ecaddb715e0e2..adaf52858e4ccf4ae5cd1f3c233257a0f1859e63 100644
--- a/contrib/eigen/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/contrib/eigen/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_CHOLMODSUPPORT_H
 #define EIGEN_CHOLMODSUPPORT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -32,7 +32,7 @@ template<> struct cholmod_configure_matrix<std::complex<double> > {
   }
 };
 
-// Other scalar types are not yet suppotred by Cholmod
+// Other scalar types are not yet supported by Cholmod
 // template<> struct cholmod_configure_matrix<float> {
 //   template<typename CholmodType>
 //   static void run(CholmodType& mat) {
@@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
 
   res.dtype   = 0;
   res.stype   = -1;
-  
+
   if (internal::is_same<_StorageIndex,int>::value)
   {
     res.itype = CHOLMOD_INT;
   }
-  else if (internal::is_same<_StorageIndex,long>::value)
+  else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
   {
     res.itype = CHOLMOD_LONG;
   }
@@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
 
   // setup res.xtype
   internal::cholmod_configure_matrix<_Scalar>::run(res);
-  
+
   res.stype = 0;
-  
+
   return res;
 }
 
@@ -121,9 +121,12 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
 cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
   cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
-  
+
   if(UpLo==Upper) res.stype =  1;
   if(UpLo==Lower) res.stype = -1;
+  // swap stype for rowmajor matrices (only works for real matrices)
+  EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  if(_Options & RowMajorBit) res.stype *=-1;
 
   return res;
 }
@@ -159,6 +162,44 @@ MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
           static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
 }
 
+namespace internal {
+
+// template specializations for int and long that call the correct cholmod method
+
+#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
+    template<typename _StorageIndex> inline ret cm_ ## name       (cholmod_common &Common) { return cholmod_ ## name   (&Common); } \
+    template<>                       inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
+
+#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
+    template<typename _StorageIndex> inline ret cm_ ## name       (t1& a1, cholmod_common &Common) { return cholmod_ ## name   (&a1, &Common); } \
+    template<>                       inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
+
+EIGEN_CHOLMOD_SPECIALIZE0(int, start)
+EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
+
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L)
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense,  cholmod_dense*,  X)
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
+
+EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
+
+template<typename _StorageIndex> inline cholmod_dense*  cm_solve         (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_solve     (sys, &L, &B, &Common); }
+template<>                       inline cholmod_dense*  cm_solve<SuiteSparse_long>   (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_l_solve   (sys, &L, &B, &Common); }
+
+template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve       (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve   (sys, &L, &B, &Common); }
+template<>                       inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
+
+template<typename _StorageIndex>
+inline int  cm_factorize_p       (cholmod_sparse*  A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p   (A, beta, fset, fsize, L, &Common); }
+template<>
+inline int  cm_factorize_p<SuiteSparse_long> (cholmod_sparse*  A, double beta[2], SuiteSparse_long* fset,          std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
+
+#undef EIGEN_CHOLMOD_SPECIALIZE0
+#undef EIGEN_CHOLMOD_SPECIALIZE1
+
+}  // namespace internal
+
+
 enum CholmodMode {
   CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt
 };
@@ -195,7 +236,7 @@ class CholmodBase : public SparseSolverBase<Derived>
     {
       EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
       m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
-      cholmod_start(&m_cholmod);
+      internal::cm_start<StorageIndex>(m_cholmod);
     }
 
     explicit CholmodBase(const MatrixType& matrix)
@@ -203,23 +244,23 @@ class CholmodBase : public SparseSolverBase<Derived>
     {
       EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
       m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
-      cholmod_start(&m_cholmod);
+      internal::cm_start<StorageIndex>(m_cholmod);
       compute(matrix);
     }
 
     ~CholmodBase()
     {
       if(m_cholmodFactor)
-        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
-      cholmod_finish(&m_cholmod);
+        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
+      internal::cm_finish<StorageIndex>(m_cholmod);
     }
-    
+
     inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
     inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
-    
+
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -235,29 +276,29 @@ class CholmodBase : public SparseSolverBase<Derived>
       factorize(matrix);
       return derived();
     }
-    
+
     /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
-      * 
+      *
       * \sa factorize()
       */
     void analyzePattern(const MatrixType& matrix)
     {
       if(m_cholmodFactor)
       {
-        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
+        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
         m_cholmodFactor = 0;
       }
       cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      m_cholmodFactor = cholmod_analyze(&A, &m_cholmod);
-      
+      m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
+
       this->m_isInitialized = true;
       this->m_info = Success;
       m_analysisIsOk = true;
       m_factorizationIsOk = false;
     }
-    
+
     /** Performs a numeric decomposition of \a matrix
       *
       * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
@@ -268,17 +309,17 @@ class CholmodBase : public SparseSolverBase<Derived>
     {
       eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
       cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
+      internal::cm_factorize_p<StorageIndex>(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod);
 
       // If the factorization failed, minor is the column at which it did. On success minor == n.
       this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
       m_factorizationIsOk = true;
     }
-    
+
     /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
      *  See the Cholmod user guide for details. */
     cholmod_common& cholmod() { return m_cholmod; }
-    
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
@@ -288,22 +329,23 @@ class CholmodBase : public SparseSolverBase<Derived>
       const Index size = m_cholmodFactor->n;
       EIGEN_UNUSED_VARIABLE(size);
       eigen_assert(size==b.rows());
-      
-      // Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref.
+
+      // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
       Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
 
       cholmod_dense b_cd = viewAsCholmod(b_ref);
-      cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
+      cholmod_dense* x_cd = internal::cm_solve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod);
       if(!x_cd)
       {
         this->m_info = NumericalIssue;
         return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+      // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve
       dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
-      cholmod_free_dense(&x_cd, &m_cholmod);
+      internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
     }
-    
+
     /** \internal */
     template<typename RhsDerived, typename DestDerived>
     void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
@@ -316,19 +358,20 @@ class CholmodBase : public SparseSolverBase<Derived>
       // note: cs stands for Cholmod Sparse
       Ref<SparseMatrix<typename RhsDerived::Scalar,ColMajor,typename RhsDerived::StorageIndex> > b_ref(b.const_cast_derived());
       cholmod_sparse b_cs = viewAsCholmod(b_ref);
-      cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
+      cholmod_sparse* x_cs = internal::cm_spsolve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod);
       if(!x_cs)
       {
         this->m_info = NumericalIssue;
         return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+      // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's sparse solver)
       dest.derived() = viewAsEigen<typename DestDerived::Scalar,ColMajor,typename DestDerived::StorageIndex>(*x_cs);
-      cholmod_free_sparse(&x_cs, &m_cholmod);
+      internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
     }
     #endif // EIGEN_PARSED_BY_DOXYGEN
-    
-    
+
+
     /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
       *
       * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
@@ -343,7 +386,7 @@ class CholmodBase : public SparseSolverBase<Derived>
       m_shiftOffset[0] = double(offset);
       return derived();
     }
-    
+
     /** \returns the determinant of the underlying matrix from the current factorization */
     Scalar determinant() const
     {
@@ -398,7 +441,7 @@ class CholmodBase : public SparseSolverBase<Derived>
     template<typename Stream>
     void dumpMemory(Stream& /*s*/)
     {}
-    
+
   protected:
     mutable cholmod_common m_cholmod;
     cholmod_factor* m_cholmodFactor;
@@ -435,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
     using Base::m_cholmod;
-    
+
   public:
-    
+
     typedef _MatrixType MatrixType;
-    
+
     CholmodSimplicialLLT() : Base() { init(); }
 
     CholmodSimplicialLLT(const MatrixType& matrix) : Base()
@@ -486,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
     using Base::m_cholmod;
-    
+
   public:
-    
+
     typedef _MatrixType MatrixType;
-    
+
     CholmodSimplicialLDLT() : Base() { init(); }
 
     CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
@@ -535,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
     using Base::m_cholmod;
-    
+
   public:
-    
+
     typedef _MatrixType MatrixType;
-    
+
     CholmodSupernodalLLT() : Base() { init(); }
 
     CholmodSupernodalLLT(const MatrixType& matrix) : Base()
@@ -586,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
     using Base::m_cholmod;
-    
+
   public:
-    
+
     typedef _MatrixType MatrixType;
-    
+
     CholmodDecomposition() : Base() { init(); }
 
     CholmodDecomposition(const MatrixType& matrix) : Base()
@@ -600,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
     }
 
     ~CholmodDecomposition() {}
-    
+
     void setMode(CholmodMode mode)
     {
       switch(mode)
diff --git a/contrib/eigen/Eigen/src/Core/Array.h b/contrib/eigen/Eigen/src/Core/Array.h
index 16770fc7b3a2c5a2e0e84da45f3b2761caf2d0fa..20c789b10c760d814516c65f539cfefc9e352f0e 100644
--- a/contrib/eigen/Eigen/src/Core/Array.h
+++ b/contrib/eigen/Eigen/src/Core/Array.h
@@ -117,7 +117,7 @@ class Array
     {
       return Base::_set(other);
     }
-    
+
     /** Default constructor.
       *
       * For fixed-size matrices, does nothing.
@@ -157,11 +157,50 @@ class Array
     EIGEN_DEVICE_FUNC
     Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
     {
-      other.swap(*this);
+      Base::operator=(std::move(other));
       return *this;
     }
 #endif
 
+    #if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+     *
+     * Example: \include Array_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Array_variadic_ctor_cxx11.out
+     *
+     * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
+     * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      *
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      *
+      * Example: \include Array_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_23_cxx11.out
+      *
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      *
+      * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
+      * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
+      *
+      * Example: \include Array_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_vector_cxx11.out
+      *
+      * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
+      * and implicit transposition is allowed for compile-time 1D arrays only.
+      *
+      * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+    #endif // end EIGEN_HAS_CXX11
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -178,6 +217,7 @@ class Array
       Base::_check_template_params();
       this->template _init2<T0,T1>(val0, val1);
     }
+
     #else
     /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
     EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
@@ -189,7 +229,8 @@ class Array
       */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE explicit Array(Index dim);
-    /** constructs an initialized 1x1 Array with the given coefficient */
+    /** constructs an initialized 1x1 Array with the given coefficient
+      * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */
     Array(const Scalar& value);
     /** constructs an uninitialized array with \a rows rows and \a cols columns.
       *
@@ -197,11 +238,14 @@ class Array
       * it is redundant to pass these parameters, so one should use the default constructor
       * Array() instead. */
     Array(Index rows, Index cols);
-    /** constructs an initialized 2D vector with given coefficients */
+    /** constructs an initialized 2D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
     Array(const Scalar& val0, const Scalar& val1);
-    #endif
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN
 
-    /** constructs an initialized 3D vector with given coefficients */
+    /** constructs an initialized 3D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
     {
@@ -211,7 +255,9 @@ class Array
       m_storage.data()[1] = val1;
       m_storage.data()[2] = val2;
     }
-    /** constructs an initialized 4D vector with given coefficients */
+    /** constructs an initialized 4D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
     {
@@ -242,8 +288,10 @@ class Array
       : Base(other.derived())
     { }
 
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT{ return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
 
     #ifdef EIGEN_ARRAY_PLUGIN
     #include EIGEN_ARRAY_PLUGIN
@@ -258,7 +306,7 @@ class Array
 /** \defgroup arraytypedefs Global array typedefs
   * \ingroup Core_Module
   *
-  * Eigen defines several typedef shortcuts for most common 1D and 2D array types.
+  * %Eigen defines several typedef shortcuts for most common 1D and 2D array types.
   *
   * The general patterns are the following:
   *
@@ -271,6 +319,12 @@ class Array
   * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
   * a fixed-size 1D array of 4 complex floats.
   *
+  * With \cpp11, template alias are also defined for common sizes.
+  * They follow the same pattern as above except that the scalar type suffix is replaced by a
+  * template parameter, i.e.:
+  *   - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
+  *   - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
+  *
   * \sa class Array
   */
 
@@ -303,8 +357,42 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
+
+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix)               \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>;    \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##SizeSuffix = Array<Type, Size, 1>;
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size)                     \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##Size##X = Array<Type, Size, Dynamic>;                \
+/** \ingroup arraytypedefs */                                     \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Array##X##Size = Array<Type, Dynamic, Size>;
+
+EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2)
+EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3)
+EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4)
+EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4)
+
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
 
-#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE
+#endif // EIGEN_HAS_CXX11
 
 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
 using Eigen::Matrix##SizeSuffix##TypeSuffix; \
diff --git a/contrib/eigen/Eigen/src/Core/ArrayBase.h b/contrib/eigen/Eigen/src/Core/ArrayBase.h
index 33f644e2132f13f2e3bae070388fde05c8339371..ea3dd1c3b38c989ee1e1a5fe3a43b10aa8ee9b70 100644
--- a/contrib/eigen/Eigen/src/Core/ArrayBase.h
+++ b/contrib/eigen/Eigen/src/Core/ArrayBase.h
@@ -69,6 +69,7 @@ template<typename Derived> class ArrayBase
     using Base::coeff;
     using Base::coeffRef;
     using Base::lazyAssign;
+    using Base::operator-;
     using Base::operator=;
     using Base::operator+=;
     using Base::operator-=;
@@ -88,7 +89,6 @@ template<typename Derived> class ArrayBase
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
 #define EIGEN_DOC_UNARY_ADDONS(X,Y)
-#   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
diff --git a/contrib/eigen/Eigen/src/Core/ArrayWrapper.h b/contrib/eigen/Eigen/src/Core/ArrayWrapper.h
index 688aadd6260a8839951f3d383d2dd9652716fa5b..2e9555b5374dc723aa181d30194056c05bc96e18 100644
--- a/contrib/eigen/Eigen/src/Core/ArrayWrapper.h
+++ b/contrib/eigen/Eigen/src/Core/ArrayWrapper.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_ARRAYWRAPPER_H
 #define EIGEN_ARRAYWRAPPER_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class ArrayWrapper
   * \ingroup Core_Module
@@ -60,14 +60,14 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     EIGEN_DEVICE_FUNC
     explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
     EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
@@ -90,9 +90,9 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }
 
-    const typename internal::remove_all<NestedExpressionType>::type& 
     EIGEN_DEVICE_FUNC
-    nestedExpression() const 
+    const typename internal::remove_all<NestedExpressionType>::type&
+    nestedExpression() const
     {
       return m_expression;
     }
@@ -158,14 +158,14 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
     EIGEN_DEVICE_FUNC
     explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
     EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
@@ -185,8 +185,8 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
     }
 
     EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<NestedExpressionType>::type& 
-    nestedExpression() const 
+    const typename internal::remove_all<NestedExpressionType>::type&
+    nestedExpression() const
     {
       return m_expression;
     }
diff --git a/contrib/eigen/Eigen/src/Core/Assign.h b/contrib/eigen/Eigen/src/Core/Assign.h
index 53806ba33c4abbfb38162c3e6bd25993fe30132f..655412efd7f26d616b002b3d584ec7fc8a5d9ea4 100644
--- a/contrib/eigen/Eigen/src/Core/Assign.h
+++ b/contrib/eigen/Eigen/src/Core/Assign.h
@@ -16,7 +16,7 @@ namespace Eigen {
 
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
   ::lazyAssign(const DenseBase<OtherDerived>& other)
 {
   enum{
diff --git a/contrib/eigen/Eigen/src/Core/AssignEvaluator.h b/contrib/eigen/Eigen/src/Core/AssignEvaluator.h
index dbe435d86b376dd1099451715686b2123759a553..7d76f0c256fd6207819921680217b30697e81ca3 100644
--- a/contrib/eigen/Eigen/src/Core/AssignEvaluator.h
+++ b/contrib/eigen/Eigen/src/Core/AssignEvaluator.h
@@ -17,24 +17,24 @@ namespace Eigen {
 // This implementation is based on Assign.h
 
 namespace internal {
-  
+
 /***************************************************************************
 * Part 1 : the logic deciding a strategy for traversal and unrolling       *
 ***************************************************************************/
 
 // copy_using_evaluator_traits is based on assign_traits
 
-template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
 struct copy_using_evaluator_traits
 {
   typedef typename DstEvaluator::XprType Dst;
   typedef typename Dst::Scalar DstScalar;
-  
+
   enum {
     DstFlags = DstEvaluator::Flags,
     SrcFlags = SrcEvaluator::Flags
   };
-  
+
 public:
   enum {
     DstAlignment = DstEvaluator::Alignment,
@@ -51,13 +51,15 @@ private:
     InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
               : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
               : int(Dst::MaxRowsAtCompileTime),
+    RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize),
+    RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize),
     OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
     MaxSizeAtCompileTime = Dst::SizeAtCompileTime
   };
 
   // TODO distinguish between linear traversal and inner-traversals
-  typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
-  typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
+  typedef typename find_best_packet<DstScalar,RestrictedLinearSize>::type LinearPacketType;
+  typedef typename find_best_packet<DstScalar,RestrictedInnerSize>::type InnerPacketType;
 
   enum {
     LinearPacketSize = unpacket_traits<LinearPacketType>::size,
@@ -97,7 +99,8 @@ private:
 
 public:
   enum {
-    Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
+    Traversal =  int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time.
+              : (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
               : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
               : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
               : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
@@ -135,7 +138,7 @@ public:
                           ? int(CompleteUnrolling)
                           : int(NoUnrolling) )
               : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
                                               : int(NoUnrolling) )
 #if EIGEN_UNALIGNED_VECTORIZE
               : int(Traversal) == int(SliceVectorizedTraversal)
@@ -172,6 +175,8 @@ public:
     EIGEN_DEBUG_VAR(MaySliceVectorize)
     std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
     EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime)
     EIGEN_DEBUG_VAR(UnrollingLimit)
     EIGEN_DEBUG_VAR(MayUnrollCompletely)
     EIGEN_DEBUG_VAR(MayUnrollInner)
@@ -195,7 +200,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
   // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
   typedef typename DstEvaluatorType::XprType DstXprType;
-  
+
   enum {
     outer = Index / DstXprType::InnerSizeAtCompileTime,
     inner = Index % DstXprType::InnerSizeAtCompileTime
@@ -261,7 +266,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
   typedef typename DstEvaluatorType::XprType DstXprType;
   typedef typename Kernel::PacketType PacketType;
-  
+
   enum {
     outer = Index / DstXprType::InnerSizeAtCompileTime,
     inner = Index % DstXprType::InnerSizeAtCompileTime,
@@ -312,6 +317,22 @@ template<typename Kernel,
          int Unrolling = Kernel::AssignmentTraits::Unrolling>
 struct dense_assignment_loop;
 
+/************************
+***** Special Cases *****
+************************/
+
+// Zero-sized assignment is a no-op.
+template<typename Kernel, int Unrolling>
+struct dense_assignment_loop<Kernel, AllAtOnceTraversal, Unrolling>
+{
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0,
+      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
+  }
+};
+
 /************************
 *** Default traversal ***
 ************************/
@@ -426,10 +447,10 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrollin
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     typedef typename Kernel::PacketType PacketType;
-    
+
     enum { size = DstXprType::SizeAtCompileTime,
            packetSize =unpacket_traits<PacketType>::size,
-           alignedSize = (size/packetSize)*packetSize };
+           alignedSize = (int(size)/packetSize)*packetSize };
 
     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
@@ -530,7 +551,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
     const Scalar *dst_ptr = kernel.dstDataPtr();
     if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
     {
-      // the pointer is not aligend-on scalar, so alignment is not possible
+      // the pointer is not aligned-on scalar, so alignment is not possible
       return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
     }
     const Index packetAlignedMask = packetSize - 1;
@@ -568,14 +589,15 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     typedef typename Kernel::PacketType PacketType;
 
-    enum { size = DstXprType::InnerSizeAtCompileTime,
+    enum { innerSize = DstXprType::InnerSizeAtCompileTime,
            packetSize =unpacket_traits<PacketType>::size,
-           vectorizableSize = (size/packetSize)*packetSize };
+           vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize),
+           size = DstXprType::SizeAtCompileTime };
 
     for(Index outer = 0; outer < kernel.outerSize(); ++outer)
     {
       copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
-      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer);
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, innerSize>::run(kernel, outer);
     }
   }
 };
@@ -599,73 +621,74 @@ protected:
   typedef typename DstEvaluatorTypeT::XprType DstXprType;
   typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
 public:
-  
+
   typedef DstEvaluatorTypeT DstEvaluatorType;
   typedef SrcEvaluatorTypeT SrcEvaluatorType;
   typedef typename DstEvaluatorType::Scalar Scalar;
   typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
   typedef typename AssignmentTraits::PacketType PacketType;
-  
-  
-  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
   {
     #ifdef EIGEN_DEBUG_ASSIGN
     AssignmentTraits::debug();
     #endif
   }
-  
-  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
-  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
-  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
-  
-  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
-  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
-  
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); }
+
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() EIGEN_NOEXCEPT { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; }
+
   /// Assign src(row,col) to dst(row,col) through the assignment functor.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
   {
     m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
   }
-  
+
   /// \sa assignCoeff(Index,Index)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
   {
     m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
   }
-  
+
   /// \sa assignCoeff(Index,Index)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
   {
-    Index row = rowIndexByOuterInner(outer, inner); 
-    Index col = colIndexByOuterInner(outer, inner); 
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
     assignCoeff(row, col);
   }
-  
-  
+
+
   template<int StoreMode, int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
   {
     m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
   }
-  
+
   template<int StoreMode, int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
   {
     m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
   }
-  
+
   template<int StoreMode, int LoadMode, typename PacketType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
   {
-    Index row = rowIndexByOuterInner(outer, inner); 
+    Index row = rowIndexByOuterInner(outer, inner);
     Index col = colIndexByOuterInner(outer, inner);
     assignPacket<StoreMode,LoadMode,PacketType>(row, col);
   }
-  
+
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
   {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
@@ -688,7 +711,7 @@ public:
   {
     return m_dstExpr.data();
   }
-  
+
 protected:
   DstEvaluatorType& m_dst;
   const SrcEvaluatorType& m_src;
@@ -697,6 +720,27 @@ protected:
   DstXprType& m_dstExpr;
 };
 
+// Special kernel used when computing small products whose operands have dynamic dimensions.  It ensures that the
+// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used
+// when computing the product.
+
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
+class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
+ public:
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::DstXprType DstXprType;
+    typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
+    typedef typename AssignmentTraits::PacketType PacketType;
+
+    EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {
+  }
+ };
+
 /***************************************************************************
 * Part 5 : Entry point for dense rectangular assignment
 ***************************************************************************/
@@ -734,13 +778,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
   resize_if_allowed(dst, src, func);
 
   DstEvaluatorType dstEvaluator(dst);
-    
+
   typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
   Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
 
   dense_assignment_loop<Kernel>::run(kernel);
 }
 
+// Specialization for filling the destination with a constant value.
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template<typename DstXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp<Eigen::internal::scalar_constant_op<typename DstXprType::Scalar>, DstXprType>& src, const internal::assign_op<typename DstXprType::Scalar,typename DstXprType::Scalar>& func)
+{
+  resize_if_allowed(dst, src, func);
+  std::fill_n(dst.data(), dst.size(), src.functor()());
+}
+#endif
+
 template<typename DstXprType, typename SrcXprType>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
 {
@@ -756,13 +810,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
 // AssignmentKind must define a Kind typedef.
 template<typename DstShape, typename SrcShape> struct AssignmentKind;
 
-// Assignement kind defined in this file:
+// Assignment kind defined in this file:
 struct Dense2Dense {};
 struct EigenBase2EigenBase {};
 
 template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
 template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
-    
+
 // This is the main assignment class
 template< typename DstXprType, typename SrcXprType, typename Functor,
           typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
@@ -787,7 +841,7 @@ void call_assignment(const Dst& dst, const Src& src)
 {
   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
 }
-                     
+
 // Deal with "assume-aliasing"
 template<typename Dst, typename Src, typename Func>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -827,14 +881,35 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
   typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
   typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
   ActualDstType actualDst(dst);
-  
+
   // TODO check whether this is the right place to perform these checks:
   EIGEN_STATIC_ASSERT_LVALUE(Dst)
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
   EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
-  
+
   Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
 }
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+{
+    typedef evaluator<Dst> DstEvaluatorType;
+    typedef evaluator<Src> SrcEvaluatorType;
+    typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Func> Kernel;
+
+    EIGEN_STATIC_ASSERT_LVALUE(Dst)
+    EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
+
+    SrcEvaluatorType srcEvaluator(src);
+    resize_if_allowed(dst, src, func);
+
+    DstEvaluatorType dstEvaluator(dst);
+    Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+    dense_assignment_loop<Kernel>::run(kernel);
+}
+
 template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment_no_alias(Dst& dst, const Src& src)
@@ -875,7 +950,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
 #ifndef EIGEN_NO_DEBUG
     internal::check_for_aliasing(dst, src);
 #endif
-    
+
     call_dense_assignment_loop(dst, src, func);
   }
 };
@@ -899,7 +974,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
     src.evalTo(dst);
   }
 
-  // NOTE The following two functions are templated to avoid their instanciation if not needed
+  // NOTE The following two functions are templated to avoid their instantiation if not needed
   //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
   template<typename SrcScalarType>
   EIGEN_DEVICE_FUNC
diff --git a/contrib/eigen/Eigen/src/Core/Assign_MKL.h b/contrib/eigen/Eigen/src/Core/Assign_MKL.h
index 6866095bf8a47a3a00f5dc558cca10f8caa3830c..c6140d185ba07aabfc930cb76f0834ce6231a710 100755
--- a/contrib/eigen/Eigen/src/Core/Assign_MKL.h
+++ b/contrib/eigen/Eigen/src/Core/Assign_MKL.h
@@ -68,16 +68,16 @@ class vml_assign_traits
 
 #define EIGEN_PP_EXPAND(ARG) ARG
 #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA
 #else
-#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA
 #endif
 
-#define EIGEN_VMLMODE_EXPAND__ 
+#define EIGEN_VMLMODE_EXPAND_x_
 
-#define EIGEN_VMLMODE_PREFIX_LA vm
-#define EIGEN_VMLMODE_PREFIX__  v
-#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
+#define EIGEN_VMLMODE_PREFIX_xLA vm
+#define EIGEN_VMLMODE_PREFIX_x_  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE)
 
 #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
   template< typename DstXprType, typename SrcXprNested>                                                                         \
@@ -89,7 +89,7 @@ class vml_assign_traits
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
         VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
-              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                           \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                           \
       } else {                                                                                                                  \
         const Index outerSize = dst.outerSize();                                                                                \
         for(Index outer = 0; outer < outerSize; ++outer) {                                                                      \
@@ -97,7 +97,7 @@ class vml_assign_traits
                                                       &(src.nestedExpression().coeffRef(0, outer));                             \
           EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                           \
           VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr,                                                                      \
-                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                             \
+                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                             \
         }                                                                                                                       \
       }                                                                                                                         \
     }                                                                                                                           \
@@ -152,7 +152,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
       {                                                                                                                       \
         VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
-              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                         \
       } else {                                                                                                                \
         const Index outerSize = dst.outerSize();                                                                              \
         for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
@@ -160,7 +160,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
                                                       &(src.lhs().coeffRef(0, outer));                                        \
           EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
           VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
-                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                          \
         }                                                                                                                     \
       }                                                                                                                       \
     }                                                                                                                         \
diff --git a/contrib/eigen/Eigen/src/Core/BandMatrix.h b/contrib/eigen/Eigen/src/Core/BandMatrix.h
index 4978c914057f41fbc12e137c3fde78791fdd9ff4..878c0240ac1699abd0b05e2d34cea9fa4929fe72 100644
--- a/contrib/eigen/Eigen/src/Core/BandMatrix.h
+++ b/contrib/eigen/Eigen/src/Core/BandMatrix.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_BANDMATRIX_H
 #define EIGEN_BANDMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -45,7 +45,7 @@ class BandMatrixBase : public EigenBase<Derived>
     };
 
   public:
-    
+
     using Base::derived;
     using Base::rows;
     using Base::cols;
@@ -55,10 +55,10 @@ class BandMatrixBase : public EigenBase<Derived>
 
     /** \returns the number of sub diagonals */
     inline Index subs() const { return derived().subs(); }
-    
+
     /** \returns an expression of the underlying coefficient matrix */
     inline const CoefficientsType& coeffs() const { return derived().coeffs(); }
-    
+
     /** \returns an expression of the underlying coefficient matrix */
     inline CoefficientsType& coeffs() { return derived().coeffs(); }
 
@@ -67,7 +67,7 @@ class BandMatrixBase : public EigenBase<Derived>
       * \warning the internal storage must be column major. */
     inline Block<CoefficientsType,Dynamic,1> col(Index i)
     {
-      EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+      EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
       Index start = 0;
       Index len = coeffs().rows();
       if (i<=supers())
@@ -90,7 +90,7 @@ class BandMatrixBase : public EigenBase<Derived>
 
     template<int Index> struct DiagonalIntReturnType {
       enum {
-        ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)),
+        ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)),
         Conjugate = ReturnOpposite && NumTraits<Scalar>::IsComplex,
         ActualIndex = ReturnOpposite ? -Index : Index,
         DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic)
@@ -130,7 +130,7 @@ class BandMatrixBase : public EigenBase<Derived>
       eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
       return Block<const CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
     }
-    
+
     template<typename Dest> inline void evalTo(Dest& dst) const
     {
       dst.resize(rows(),cols());
@@ -192,7 +192,7 @@ struct traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
     Options = _Options,
     DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
   };
-  typedef Matrix<Scalar,DataRowsAtCompileTime,ColsAtCompileTime,Options&RowMajor?RowMajor:ColMajor> CoefficientsType;
+  typedef Matrix<Scalar, DataRowsAtCompileTime, ColsAtCompileTime, int(Options) & int(RowMajor) ? RowMajor : ColMajor> CoefficientsType;
 };
 
 template<typename _Scalar, int Rows, int Cols, int Supers, int Subs, int Options>
@@ -211,16 +211,16 @@ class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Sub
     }
 
     /** \returns the number of columns */
-    inline Index rows() const { return m_rows.value(); }
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
 
     /** \returns the number of rows */
-    inline Index cols() const { return m_coeffs.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
 
     /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
 
     /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
 
     inline const CoefficientsType& coeffs() const { return m_coeffs; }
     inline CoefficientsType& coeffs() { return m_coeffs; }
@@ -275,16 +275,16 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT
     }
 
     /** \returns the number of columns */
-    inline Index rows() const { return m_rows.value(); }
+    inline EIGEN_CONSTEXPR Index rows() const { return m_rows.value(); }
 
     /** \returns the number of rows */
-    inline Index cols() const { return m_coeffs.cols(); }
+    inline EIGEN_CONSTEXPR Index cols() const { return m_coeffs.cols(); }
 
     /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+    inline EIGEN_CONSTEXPR Index supers() const { return m_supers.value(); }
 
     /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+    inline EIGEN_CONSTEXPR Index subs() const { return m_subs.value(); }
 
     inline const CoefficientsType& coeffs() const { return m_coeffs; }
 
diff --git a/contrib/eigen/Eigen/src/Core/Block.h b/contrib/eigen/Eigen/src/Core/Block.h
index 11de45c2ecbc8ccc6d48d12f091fe067f64399e5..3206d6633e62c29f4466155f47725511a34ba640 100644
--- a/contrib/eigen/Eigen/src/Core/Block.h
+++ b/contrib/eigen/Eigen/src/Core/Block.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_BLOCK_H
 #define EIGEN_BLOCK_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
@@ -52,7 +52,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
     Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
     // FIXME DirectAccessBit should not be handled by expressions
-    // 
+    //
     // Alignment is needed by MapBase's assertions
     // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
     Alignment = 0
@@ -61,7 +61,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
 
 template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false,
          bool HasDirectAccess = internal::has_direct_access<XprType>::ret> class BlockImpl_dense;
-         
+
 } // end namespace internal
 
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
@@ -109,13 +109,13 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
     typedef Impl Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
-    
+
     typedef typename internal::remove_all<XprType>::type NestedExpression;
-  
+
     /** Column or Row constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr, Index i) : Impl(xpr,i)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr, Index i) : Impl(xpr,i)
     {
       eigen_assert( (i>=0) && (
           ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
@@ -124,8 +124,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr, Index startRow, Index startCol)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr, Index startRow, Index startCol)
       : Impl(xpr, startRow, startCol)
     {
       EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
@@ -135,8 +135,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline Block(XprType& xpr,
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Block(XprType& xpr,
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
       : Impl(xpr, startRow, startCol, blockRows, blockCols)
@@ -147,7 +147,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
           && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols);
     }
 };
-         
+
 // The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense
 // that must be specialized for direct and non-direct access...
 template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
@@ -159,10 +159,10 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
   public:
     typedef Impl Base;
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
     EIGEN_DEVICE_FUNC
-    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+    EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
       : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };
 
@@ -294,25 +294,25 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
     EIGEN_DEVICE_FUNC inline Index outerStride() const;
     #endif
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
-    { 
-      return m_xpr; 
+    {
+      return m_xpr;
     }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     XprType& nestedExpression() { return m_xpr; }
-      
-    EIGEN_DEVICE_FUNC
-    StorageIndex startRow() const
-    { 
-      return m_startRow.value(); 
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startRow() const EIGEN_NOEXCEPT
+    {
+      return m_startRow.value();
     }
-      
-    EIGEN_DEVICE_FUNC
-    StorageIndex startCol() const
-    { 
-      return m_startCol.value(); 
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startCol() const EIGEN_NOEXCEPT
+    {
+      return m_startCol.value();
     }
 
   protected:
@@ -342,9 +342,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Column or Row constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, Index i)
-      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, Index i)
+      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
                                 || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
              BlockRows==1 ? 1 : xpr.rows(),
              BlockCols==1 ? 1 : xpr.cols()),
@@ -357,8 +357,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
       : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
         m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
@@ -367,8 +367,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr,
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr,
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
       : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
@@ -377,18 +377,18 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
       init();
     }
 
-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
-    { 
-      return m_xpr; 
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const EIGEN_NOEXCEPT
+    {
+      return m_xpr;
     }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     XprType& nestedExpression() { return m_xpr; }
-      
+
     /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index innerStride() const EIGEN_NOEXCEPT
     {
       return internal::traits<BlockType>::HasSameStorageOrderAsXprType
              ? m_xpr.innerStride()
@@ -396,23 +396,19 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     }
 
     /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index outerStride() const EIGEN_NOEXCEPT
     {
-      return m_outerStride;
+      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
+                    ? m_xpr.outerStride()
+                    : m_xpr.innerStride();
     }
 
-    EIGEN_DEVICE_FUNC
-    StorageIndex startRow() const
-    {
-      return m_startRow.value();
-    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startRow() const EIGEN_NOEXCEPT { return m_startRow.value(); }
 
-    EIGEN_DEVICE_FUNC
-    StorageIndex startCol() const
-    {
-      return m_startCol.value();
-    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    StorageIndex startCol() const EIGEN_NOEXCEPT { return m_startCol.value(); }
 
   #ifndef __SUNPRO_CC
   // FIXME sunstudio is not friendly with the above friend...
@@ -422,8 +418,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal used by allowAligned() */
-    EIGEN_DEVICE_FUNC
-    inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
       : Base(data, blockRows, blockCols), m_xpr(xpr)
     {
       init();
@@ -431,7 +427,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     #endif
 
   protected:
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void init()
     {
       m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
diff --git a/contrib/eigen/Eigen/src/Core/BooleanRedux.h b/contrib/eigen/Eigen/src/Core/BooleanRedux.h
index 8409d8749adc9faf3002cf1ff9fb26a31001b352..852de8b90a565b534a70ff61493ca2684a9d1638 100644
--- a/contrib/eigen/Eigen/src/Core/BooleanRedux.h
+++ b/contrib/eigen/Eigen/src/Core/BooleanRedux.h
@@ -14,58 +14,56 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Derived, int UnrollCount>
+template<typename Derived, int UnrollCount, int Rows>
 struct all_unroller
 {
-  typedef typename Derived::ExpressionTraits Traits;
   enum {
-    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
-    row = (UnrollCount-1) % Traits::RowsAtCompileTime
+    col = (UnrollCount-1) / Rows,
+    row = (UnrollCount-1) % Rows
   };
 
-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
   {
-    return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
+    return all_unroller<Derived, UnrollCount-1, Rows>::run(mat) && mat.coeff(row, col);
   }
 };
 
-template<typename Derived>
-struct all_unroller<Derived, 0>
+template<typename Derived, int Rows>
+struct all_unroller<Derived, 0, Rows>
 {
-  static inline bool run(const Derived &/*mat*/) { return true; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
 };
 
-template<typename Derived>
-struct all_unroller<Derived, Dynamic>
+template<typename Derived, int Rows>
+struct all_unroller<Derived, Dynamic, Rows>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };
 
-template<typename Derived, int UnrollCount>
+template<typename Derived, int UnrollCount, int Rows>
 struct any_unroller
 {
-  typedef typename Derived::ExpressionTraits Traits;
   enum {
-    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
-    row = (UnrollCount-1) % Traits::RowsAtCompileTime
+    col = (UnrollCount-1) / Rows,
+    row = (UnrollCount-1) % Rows
   };
   
-  static inline bool run(const Derived &mat)
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
   {
-    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
+    return any_unroller<Derived, UnrollCount-1, Rows>::run(mat) || mat.coeff(row, col);
   }
 };
 
-template<typename Derived>
-struct any_unroller<Derived, 0>
+template<typename Derived, int Rows>
+struct any_unroller<Derived, 0, Rows>
 {
-  static inline bool run(const Derived & /*mat*/) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
 };
 
-template<typename Derived>
-struct any_unroller<Derived, Dynamic>
+template<typename Derived, int Rows>
+struct any_unroller<Derived, Dynamic, Rows>
 {
-  static inline bool run(const Derived &) { return false; }
+  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };
 
 } // end namespace internal
@@ -78,16 +76,16 @@ struct any_unroller<Derived, Dynamic>
   * \sa any(), Cwise::operator<()
   */
 template<typename Derived>
-inline bool DenseBase<Derived>::all() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
 {
   typedef internal::evaluator<Derived> Evaluator;
   enum {
     unroll = SizeAtCompileTime != Dynamic
-          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits<Scalar>::AddCost)) <= EIGEN_UNROLLING_LIMIT
   };
   Evaluator evaluator(derived());
   if(unroll)
-    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
+    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
@@ -102,16 +100,16 @@ inline bool DenseBase<Derived>::all() const
   * \sa all()
   */
 template<typename Derived>
-inline bool DenseBase<Derived>::any() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
 {
   typedef internal::evaluator<Derived> Evaluator;
   enum {
     unroll = SizeAtCompileTime != Dynamic
-          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits<Scalar>::AddCost)) <= EIGEN_UNROLLING_LIMIT
   };
   Evaluator evaluator(derived());
   if(unroll)
-    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
+    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
@@ -126,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
   * \sa all(), any()
   */
 template<typename Derived>
-inline Eigen::Index DenseBase<Derived>::count() const
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
 {
   return derived().template cast<bool>().template cast<Index>().sum();
 }
diff --git a/contrib/eigen/Eigen/src/Core/CommaInitializer.h b/contrib/eigen/Eigen/src/Core/CommaInitializer.h
index d218e98143f40ec0b1ac74ab5af21b62af3ff8be..c0e29c75c223666d37e6bb25f03af09b8ba65e01 100644
--- a/contrib/eigen/Eigen/src/Core/CommaInitializer.h
+++ b/contrib/eigen/Eigen/src/Core/CommaInitializer.h
@@ -33,6 +33,8 @@ struct CommaInitializer
   inline CommaInitializer(XprType& xpr, const Scalar& s)
     : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
   {
+    eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0
+      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
     m_xpr.coeffRef(0,0) = s;
   }
 
@@ -41,6 +43,8 @@ struct CommaInitializer
   inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
     : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
   {
+    eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols()
+      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
     m_xpr.block(0, 0, other.rows(), other.cols()) = other;
   }
 
@@ -103,7 +107,7 @@ struct CommaInitializer
   EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
 #endif
   {
-      finished();
+    finished();
   }
 
   /** \returns the built matrix once all its coefficients have been set.
@@ -141,7 +145,7 @@ struct CommaInitializer
   * \sa CommaInitializer::finished(), class CommaInitializer
   */
 template<typename Derived>
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
 {
   return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
@@ -149,7 +153,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
 /** \sa operator<<(const Scalar&) */
 template<typename Derived>
 template<typename OtherDerived>
-inline CommaInitializer<Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
 DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
 {
   return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
diff --git a/contrib/eigen/Eigen/src/Core/CoreEvaluators.h b/contrib/eigen/Eigen/src/Core/CoreEvaluators.h
index 910889efa708ac3d74aae2d63c7a150a7d7723e8..0ff8c8deb820deb7bbaa2f762afc5e4366f29616 100644
--- a/contrib/eigen/Eigen/src/Core/CoreEvaluators.h
+++ b/contrib/eigen/Eigen/src/Core/CoreEvaluators.h
@@ -14,7 +14,7 @@
 #define EIGEN_COREEVALUATORS_H
 
 namespace Eigen {
-  
+
 namespace internal {
 
 // This class returns the evaluator kind from the expression storage kind.
@@ -63,8 +63,8 @@ template< typename T,
 template< typename T,
           typename Kind   = typename evaluator_traits<typename T::NestedExpression>::Kind,
           typename Scalar = typename T::Scalar> struct unary_evaluator;
-          
-// evaluator_traits<T> contains traits for evaluator<T> 
+
+// evaluator_traits<T> contains traits for evaluator<T>
 
 template<typename T>
 struct evaluator_traits_base
@@ -90,7 +90,8 @@ template<typename T>
 struct evaluator : public unary_evaluator<T>
 {
   typedef unary_evaluator<T> Base;
-  EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const T& xpr) : Base(xpr) {}
 };
 
 
@@ -99,21 +100,29 @@ template<typename T>
 struct evaluator<const T>
   : evaluator<T>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
 };
 
 // ---------- base class for all evaluators ----------
 
 template<typename ExpressionType>
-struct evaluator_base : public noncopyable
+struct evaluator_base
 {
   // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
   typedef traits<ExpressionType> ExpressionTraits;
-  
+
   enum {
     Alignment = 0
   };
+  // noncopyable:
+  // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
+  // and make complex evaluator much larger than then should do.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {}
+private:
+  EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
+  EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&);
 };
 
 // -------------------- Matrix and Array --------------------
@@ -123,6 +132,33 @@ struct evaluator_base : public noncopyable
 // Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
 // so no need for more sophisticated dispatching.
 
+// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
+template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
+public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
+  {
+#ifndef EIGEN_INTERNAL_DEBUGGING
+    EIGEN_UNUSED_VARIABLE(outerStride);
+#endif
+    eigen_internal_assert(outerStride==OuterStride);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; }
+  const Scalar *data;
+};
+
+template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
+public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index outerStride() const { return m_outerStride; }
+  const Scalar *data;
+protected:
+  Index m_outerStride;
+};
+
 template<typename Derived>
 struct evaluator<PlainObjectBase<Derived> >
   : evaluator_base<Derived>
@@ -136,23 +172,28 @@ struct evaluator<PlainObjectBase<Derived> >
     IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime,
     RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
     ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
-    
+
     CoeffReadCost = NumTraits<Scalar>::ReadCost,
     Flags = traits<Derived>::EvaluatorFlags,
     Alignment = traits<Derived>::Alignment
   };
-  
-  EIGEN_DEVICE_FUNC evaluator()
-    : m_data(0),
-      m_outerStride(IsVectorAtCompileTime  ? 0 
-                                           : int(IsRowMajor) ? ColsAtCompileTime 
-                                           : RowsAtCompileTime)
+  enum {
+    // We do not need to know the outer stride for vectors
+    OuterStrideAtCompileTime = IsVectorAtCompileTime  ? 0
+                                                      : int(IsRowMajor) ? ColsAtCompileTime
+                                                                        : RowsAtCompileTime
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator()
+    : m_d(0,OuterStrideAtCompileTime)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
-  
-  EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
-    : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) 
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const PlainObjectType& m)
+    : m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride())
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
@@ -161,30 +202,30 @@ struct evaluator<PlainObjectBase<Derived> >
   CoeffReturnType coeff(Index row, Index col) const
   {
     if (IsRowMajor)
-      return m_data[row * m_outerStride.value() + col];
+      return m_d.data[row * m_d.outerStride() + col];
     else
-      return m_data[row + col * m_outerStride.value()];
+      return m_d.data[row + col * m_d.outerStride()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_data[index];
+    return m_d.data[index];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
     if (IsRowMajor)
-      return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
+      return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
     else
-      return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
+      return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return const_cast<Scalar*>(m_data)[index];
+    return const_cast<Scalar*>(m_d.data)[index];
   }
 
   template<int LoadMode, typename PacketType>
@@ -192,16 +233,16 @@ struct evaluator<PlainObjectBase<Derived> >
   PacketType packet(Index row, Index col) const
   {
     if (IsRowMajor)
-      return ploadt<PacketType, LoadMode>(m_data + row * m_outerStride.value() + col);
+      return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
     else
-      return ploadt<PacketType, LoadMode>(m_data + row + col * m_outerStride.value());
+      return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return ploadt<PacketType, LoadMode>(m_data + index);
+    return ploadt<PacketType, LoadMode>(m_d.data + index);
   }
 
   template<int StoreMode,typename PacketType>
@@ -210,26 +251,22 @@ struct evaluator<PlainObjectBase<Derived> >
   {
     if (IsRowMajor)
       return pstoret<Scalar, PacketType, StoreMode>
-	            (const_cast<Scalar*>(m_data) + row * m_outerStride.value() + col, x);
+	            (const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
     else
       return pstoret<Scalar, PacketType, StoreMode>
-                    (const_cast<Scalar*>(m_data) + row + col * m_outerStride.value(), x);
+                    (const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
   }
 
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketType& x)
   {
-    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
+    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
   }
 
 protected:
-  const Scalar *m_data;
 
-  // We do not need to know the outer stride for vectors
-  variable_if_dynamic<Index, IsVectorAtCompileTime  ? 0 
-                                                    : int(IsRowMajor) ? ColsAtCompileTime 
-                                                    : RowsAtCompileTime> m_outerStride;
+  plainobjectbase_evaluator_data<Scalar,OuterStrideAtCompileTime> m_d;
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -237,11 +274,13 @@ struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
   : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
 {
   typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
-  
-  EIGEN_DEVICE_FUNC evaluator() {}
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
-    : evaluator<PlainObjectBase<XprType> >(m) 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m)
   { }
 };
 
@@ -251,10 +290,12 @@ struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
 {
   typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  EIGEN_DEVICE_FUNC evaluator() {}
-  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
-    : evaluator<PlainObjectBase<XprType> >(m) 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  evaluator() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m)
   { }
 };
 
@@ -265,14 +306,15 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
   : evaluator_base<Transpose<ArgType> >
 {
   typedef Transpose<ArgType> XprType;
-  
+
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,    
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
     Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
     Alignment = evaluator<ArgType>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
 
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -457,10 +499,10 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
 {
   typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
   typedef typename internal::remove_all<PlainObjectType>::type PlainObjectTypeCleaned;
-  
+
   enum {
     CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
-    
+
     Flags = (evaluator<PlainObjectTypeCleaned>::Flags
           &  (  HereditaryBits
               | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
@@ -517,19 +559,17 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
   : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> >
 {
   typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
-  
+
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
-    
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+
     Flags = evaluator<ArgType>::Flags
           & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
     Alignment = evaluator<ArgType>::Alignment
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit unary_evaluator(const XprType& op)
-    : m_functor(op.functor()), 
-      m_argImpl(op.nestedExpression()) 
+  explicit unary_evaluator(const XprType& op) : m_d(op)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -540,32 +580,43 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_functor(m_argImpl.coeff(row, col));
+    return m_d.func()(m_d.argImpl.coeff(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_functor(m_argImpl.coeff(index));
+    return m_d.func()(m_d.argImpl.coeff(index));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(row, col));
+    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(row, col));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index));
+    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
   }
 
 protected:
-  const UnaryOp m_functor;
-  evaluator<ArgType> m_argImpl;
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const UnaryOp& func() const { return op; }
+    UnaryOp op;
+    evaluator<ArgType> argImpl;
+  };
+
+  Data m_d;
 };
 
 // -------------------- CwiseTernaryOp --------------------
@@ -577,7 +628,7 @@ struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
 {
   typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
   typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> > Base;
-  
+
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
@@ -586,10 +637,10 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
   : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> >
 {
   typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
-  
+
   enum {
-    CoeffReadCost = evaluator<Arg1>::CoeffReadCost + evaluator<Arg2>::CoeffReadCost + evaluator<Arg3>::CoeffReadCost + functor_traits<TernaryOp>::Cost,
-    
+    CoeffReadCost = int(evaluator<Arg1>::CoeffReadCost) + int(evaluator<Arg2>::CoeffReadCost) + int(evaluator<Arg3>::CoeffReadCost) + int(functor_traits<TernaryOp>::Cost),
+
     Arg1Flags = evaluator<Arg1>::Flags,
     Arg2Flags = evaluator<Arg2>::Flags,
     Arg3Flags = evaluator<Arg3>::Flags,
@@ -609,11 +660,7 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
         evaluator<Arg3>::Alignment)
   };
 
-  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr)
-    : m_functor(xpr.functor()),
-      m_arg1Impl(xpr.arg1()), 
-      m_arg2Impl(xpr.arg2()), 
-      m_arg3Impl(xpr.arg3())  
+  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -624,38 +671,48 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col));
+    return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+    return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(row, col),
-                              m_arg2Impl.template packet<LoadMode,PacketType>(row, col),
-                              m_arg3Impl.template packet<LoadMode,PacketType>(row, col));
+    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.arg2Impl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.arg3Impl.template packet<LoadMode,PacketType>(row, col));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(index),
-                              m_arg2Impl.template packet<LoadMode,PacketType>(index),
-                              m_arg3Impl.template packet<LoadMode,PacketType>(index));
+    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(index),
+                               m_d.arg2Impl.template packet<LoadMode,PacketType>(index),
+                               m_d.arg3Impl.template packet<LoadMode,PacketType>(index));
   }
 
 protected:
-  const TernaryOp m_functor;
-  evaluator<Arg1> m_arg1Impl;
-  evaluator<Arg2> m_arg2Impl;
-  evaluator<Arg3> m_arg3Impl;
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TernaryOp& func() const { return op; }
+    TernaryOp op;
+    evaluator<Arg1> arg1Impl;
+    evaluator<Arg2> arg2Impl;
+    evaluator<Arg3> arg3Impl;
+  };
+
+  Data m_d;
 };
 
 // -------------------- CwiseBinaryOp --------------------
@@ -667,8 +724,9 @@ struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
   typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
-  
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
 template<typename BinaryOp, typename Lhs, typename Rhs>
@@ -676,10 +734,10 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
   : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
-  
+
   enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+
     LhsFlags = evaluator<Lhs>::Flags,
     RhsFlags = evaluator<Rhs>::Flags,
     SameType = is_same<typename Lhs::Scalar,typename Rhs::Scalar>::value,
@@ -696,10 +754,8 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
   };
 
-  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
-    : m_functor(xpr.functor()),
-      m_lhsImpl(xpr.lhs()), 
-      m_rhsImpl(xpr.rhs())  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit binary_evaluator(const XprType& xpr) : m_d(xpr)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -710,35 +766,46 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
+    return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
+    return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col),
-                              m_rhsImpl.template packet<LoadMode,PacketType>(row, col));
+    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(row, col),
+                               m_d.rhsImpl.template packet<LoadMode,PacketType>(row, col));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index),
-                              m_rhsImpl.template packet<LoadMode,PacketType>(index));
+    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(index),
+                               m_d.rhsImpl.template packet<LoadMode,PacketType>(index));
   }
 
 protected:
-  const BinaryOp m_functor;
-  evaluator<Lhs> m_lhsImpl;
-  evaluator<Rhs> m_rhsImpl;
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const BinaryOp& func() const { return op; }
+    BinaryOp op;
+    evaluator<Lhs> lhsImpl;
+    evaluator<Rhs> rhsImpl;
+  };
+
+  Data m_d;
 };
 
 // -------------------- CwiseUnaryView --------------------
@@ -748,18 +815,16 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
   : evaluator_base<CwiseUnaryView<UnaryOp, ArgType> >
 {
   typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
-  
+
   enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
-    
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+
     Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
-    
+
     Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
   };
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
-    : m_unaryOp(op.functor()), 
-      m_argImpl(op.nestedExpression()) 
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -771,30 +836,41 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_unaryOp(m_argImpl.coeff(row, col));
+    return m_d.func()(m_d.argImpl.coeff(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_unaryOp(m_argImpl.coeff(index));
+    return m_d.func()(m_d.argImpl.coeff(index));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
-    return m_unaryOp(m_argImpl.coeffRef(row, col));
+    return m_d.func()(m_d.argImpl.coeffRef(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return m_unaryOp(m_argImpl.coeffRef(index));
+    return m_d.func()(m_d.argImpl.coeffRef(index));
   }
 
 protected:
-  const UnaryOp m_unaryOp;
-  evaluator<ArgType> m_argImpl;
+
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data
+  {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const UnaryOp& func() const { return op; }
+    UnaryOp op;
+    evaluator<ArgType> argImpl;
+  };
+
+  Data m_d;
 };
 
 // -------------------- Map --------------------
@@ -811,14 +887,15 @@ struct mapbase_evaluator : evaluator_base<Derived>
   typedef typename XprType::PointerType PointerType;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  
+
   enum {
     IsRowMajor = XprType::RowsAtCompileTime,
     ColsAtCompileTime = XprType::ColsAtCompileTime,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 
-  EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit mapbase_evaluator(const XprType& map)
     : m_data(const_cast<PointerType>(map.data())),
       m_innerStride(map.innerStride()),
       m_outerStride(map.outerStride())
@@ -882,17 +959,21 @@ struct mapbase_evaluator : evaluator_base<Derived>
     internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
 protected:
-  EIGEN_DEVICE_FUNC
-  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
-  EIGEN_DEVICE_FUNC
-  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index rowStride() const EIGEN_NOEXCEPT {
+    return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index colStride() const EIGEN_NOEXCEPT {
+     return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
+  }
 
   PointerType m_data;
   const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
   const internal::variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
 };
 
-template<typename PlainObjectType, int MapOptions, typename StrideType> 
+template<typename PlainObjectType, int MapOptions, typename StrideType>
 struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
   : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType>
 {
@@ -900,7 +981,7 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
   typedef typename XprType::Scalar Scalar;
   // TODO: should check for smaller packet types once we can handle multi-sized packet types
   typedef typename packet_traits<Scalar>::type PacketScalar;
-  
+
   enum {
     InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
                              ? int(PlainObjectType::InnerStrideAtCompileTime)
@@ -912,34 +993,35 @@ struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
     HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
     HasNoStride = HasNoInnerStride && HasNoOuterStride,
     IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
-    
+
     PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit),
     LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit),
     Flags = int( evaluator<PlainObjectType>::Flags) & (LinearAccessMask&PacketAccessMask),
-    
+
     Alignment = int(MapOptions)&int(AlignedMask)
   };
 
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
-    : mapbase_evaluator<XprType, PlainObjectType>(map) 
+    : mapbase_evaluator<XprType, PlainObjectType>(map)
   { }
 };
 
 // -------------------- Ref --------------------
 
-template<typename PlainObjectType, int RefOptions, typename StrideType> 
+template<typename PlainObjectType, int RefOptions, typename StrideType>
 struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
   : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType>
 {
   typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
-  
+
   enum {
     Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
     Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
   };
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
-    : mapbase_evaluator<XprType, PlainObjectType>(ref) 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& ref)
+    : mapbase_evaluator<XprType, PlainObjectType>(ref)
   { }
 };
 
@@ -947,8 +1029,8 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
 
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
          bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> struct block_evaluator;
-         
-template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
   : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
 {
@@ -956,15 +1038,15 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
   typedef typename XprType::Scalar Scalar;
   // TODO: should check for smaller packet types once we can handle multi-sized packet types
   typedef typename packet_traits<Scalar>::type PacketScalar;
-  
+
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    
+
     RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
     ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
     MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
-    
+
     ArgTypeIsRowMajor = (int(evaluator<ArgType>::Flags)&RowMajorBit) != 0,
     IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1
                : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
@@ -978,14 +1060,14 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
                              ? int(outer_stride_at_compile_time<ArgType>::ret)
                              : int(inner_stride_at_compile_time<ArgType>::ret),
     MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
-    
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
+
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
     FlagsRowMajorBit = XprType::Flags&RowMajorBit,
     Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
                                            DirectAccessBit |
                                            MaskPacketAccessBit),
     Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
-    
+
     PacketAlignment = unpacket_traits<PacketScalar>::alignment,
     Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
                              && (OuterStrideAtCompileTime!=0)
@@ -993,7 +1075,8 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
   };
   typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& block) : block_evaluator_type(block)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
@@ -1006,8 +1089,9 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAcc
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
-    : unary_evaluator<XprType>(block) 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit block_evaluator(const XprType& block)
+    : unary_evaluator<XprType>(block)
   {}
 };
 
@@ -1017,79 +1101,74 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
-    : m_argImpl(block.nestedExpression()), 
-      m_startRow(block.startRow()), 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& block)
+    : m_argImpl(block.nestedExpression()),
+      m_startRow(block.startRow()),
       m_startCol(block.startCol()),
-      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
+      m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
   { }
- 
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
-    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
+    ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
- 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
-  { 
-    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); 
+  {
+    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col);
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
-  { 
-    if (ForwardLinearAccess)
-      return m_argImpl.coeff(m_linear_offset.value() + index); 
-    else
-      return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  {
+    return linear_coeff_impl(index, bool_constant<ForwardLinearAccess>());
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
-  { 
-    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); 
+  {
+    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col);
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
-  { 
-    if (ForwardLinearAccess)
-      return m_argImpl.coeffRef(m_linear_offset.value() + index); 
-    else
-      return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  {
+    return linear_coeffRef_impl(index, bool_constant<ForwardLinearAccess>());
   }
- 
+
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index row, Index col) const 
-  { 
-    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col); 
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col);
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  PacketType packet(Index index) const 
-  { 
+  PacketType packet(Index index) const
+  {
     if (ForwardLinearAccess)
       return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
     else
       return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
                                          RowsAtCompileTime == 1 ? index : 0);
   }
-  
+
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  void writePacket(Index row, Index col, const PacketType& x) 
+  void writePacket(Index row, Index col, const PacketType& x)
   {
-    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x); 
+    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
   }
-  
+
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
-  void writePacket(Index index, const PacketType& x) 
+  void writePacket(Index index, const PacketType& x)
   {
     if (ForwardLinearAccess)
       return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
@@ -1098,18 +1177,40 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
                                               RowsAtCompileTime == 1 ? index : 0,
                                               x);
   }
- 
+
 protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const
+  {
+    return m_argImpl.coeff(m_linear_offset.value() + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const
+  {
+    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */)
+  {
+    return m_argImpl.coeffRef(m_linear_offset.value() + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */)
+  {
+    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
   evaluator<ArgType> m_argImpl;
   const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
   const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
-  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
+  const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
 };
 
-// TODO: This evaluator does not actually use the child evaluator; 
+// TODO: This evaluator does not actually use the child evaluator;
 // all action is via the data() as returned by the Block expression.
 
-template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
   : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
                       typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
@@ -1117,8 +1218,9 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
   typedef typename XprType::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
-    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit block_evaluator(const XprType& block)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block)
   {
     // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
     eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
@@ -1141,18 +1243,19 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
                                          evaluator<ElseMatrixType>::CoeffReadCost),
 
     Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits,
-    
+
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
   };
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& select)
     : m_conditionImpl(select.conditionMatrix()),
       m_thenImpl(select.thenMatrix()),
       m_elseImpl(select.elseMatrix())
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
- 
+
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1172,7 +1275,7 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
     else
       return m_elseImpl.coeff(index);
   }
- 
+
 protected:
   evaluator<ConditionMatrixType> m_conditionImpl;
   evaluator<ThenMatrixType> m_thenImpl;
@@ -1182,7 +1285,7 @@ protected:
 
 // -------------------- Replicate --------------------
 
-template<typename ArgType, int RowFactor, int ColFactor> 
+template<typename ArgType, int RowFactor, int ColFactor>
 struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
   : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
 {
@@ -1193,22 +1296,23 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
   };
   typedef typename internal::nested_eval<ArgType,Factor>::type ArgTypeNested;
   typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
-  
+
   enum {
     CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
     LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
     Flags = (evaluator<ArgTypeNestedCleaned>::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit),
-    
+
     Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& replicate)
     : m_arg(replicate.nestedExpression()),
       m_argImpl(m_arg),
       m_rows(replicate.nestedExpression().rows()),
       m_cols(replicate.nestedExpression().cols())
   {}
- 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
@@ -1219,10 +1323,10 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
     const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
                            : ColFactor==1 ? col
                            : col % m_cols.value();
-    
+
     return m_argImpl.coeff(actual_row, actual_col);
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
@@ -1230,7 +1334,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
     const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
                                   ? (ColFactor==1 ?  index : index%m_cols.value())
                                   : (RowFactor==1 ?  index : index%m_rows.value());
-    
+
     return m_argImpl.coeff(actual_index);
   }
 
@@ -1247,7 +1351,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
 
     return m_argImpl.template packet<LoadMode,PacketType>(actual_row, actual_col);
   }
-  
+
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
@@ -1258,7 +1362,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
 
     return m_argImpl.template packet<LoadMode,PacketType>(actual_index);
   }
- 
+
 protected:
   const ArgTypeNested m_arg;
   evaluator<ArgTypeNestedCleaned> m_argImpl;
@@ -1266,64 +1370,6 @@ protected:
   const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
 };
 
-
-// -------------------- PartialReduxExpr --------------------
-
-template< typename ArgType, typename MemberOp, int Direction>
-struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
-  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
-{
-  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
-  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
-  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
-  typedef typename ArgType::Scalar InputScalar;
-  typedef typename XprType::Scalar Scalar;
-  enum {
-    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
-  };
-  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
-  enum {
-    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
-                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
-    
-    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit,
-    
-    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
-  };
-
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
-    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
-  {
-    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value));
-    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
-  }
-
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const Scalar coeff(Index i, Index j) const
-  {
-    if (Direction==Vertical)
-      return m_functor(m_arg.col(j));
-    else
-      return m_functor(m_arg.row(i));
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const Scalar coeff(Index index) const
-  {
-    if (Direction==Vertical)
-      return m_functor(m_arg.col(index));
-    else
-      return m_functor(m_arg.row(index));
-  }
-
-protected:
-  typename internal::add_const_on_value_type<ArgTypeNested>::type m_arg;
-  const MemberOp m_functor;
-};
-
-
 // -------------------- MatrixWrapper and ArrayWrapper --------------------
 //
 // evaluator_wrapper_base<T> is a common base class for the
@@ -1340,7 +1386,8 @@ struct evaluator_wrapper_base
     Alignment = evaluator<ArgType>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
 
   typedef typename ArgType::Scalar Scalar;
   typedef typename ArgType::CoeffReturnType CoeffReturnType;
@@ -1407,7 +1454,8 @@ struct unary_evaluator<MatrixWrapper<TArgType> >
 {
   typedef MatrixWrapper<TArgType> XprType;
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& wrapper)
     : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
   { }
 };
@@ -1418,7 +1466,8 @@ struct unary_evaluator<ArrayWrapper<TArgType> >
 {
   typedef ArrayWrapper<TArgType> XprType;
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& wrapper)
     : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
   { }
 };
@@ -1445,9 +1494,9 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
     ReversePacket = (Direction == BothDirections)
                     || ((Direction == Vertical)   && IsColMajor)
                     || ((Direction == Horizontal) && IsRowMajor),
-                    
+
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    
+
     // let's enable LinearAccess only with vectorization because of the product overhead
     // FIXME enable DirectAccess with negative strides?
     Flags0 = evaluator<ArgType>::Flags,
@@ -1456,16 +1505,17 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
                  ? LinearAccessBit : 0,
 
     Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
-    
+
     Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
   };
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit unary_evaluator(const XprType& reverse)
     : m_argImpl(reverse.nestedExpression()),
       m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
       m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
   { }
- 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
@@ -1540,7 +1590,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
     m_argImpl.template writePacket<LoadMode>
       (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
   }
- 
+
 protected:
   evaluator<ArgType> m_argImpl;
 
@@ -1558,20 +1608,21 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
   : evaluator_base<Diagonal<ArgType, DiagIndex> >
 {
   typedef Diagonal<ArgType, DiagIndex> XprType;
-  
+
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    
+
     Flags = (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
-    
+
     Alignment = 0
   };
 
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit evaluator(const XprType& diagonal)
     : m_argImpl(diagonal.nestedExpression()),
       m_index(diagonal.index())
   { }
- 
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
@@ -1604,8 +1655,10 @@ protected:
   const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
 
 private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+  Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
 };
 
 
@@ -1629,25 +1682,25 @@ class EvalToTemp
   : public dense_xpr_base<EvalToTemp<ArgType> >::type
 {
  public:
- 
+
   typedef typename dense_xpr_base<EvalToTemp>::type Base;
   EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
- 
+
   explicit EvalToTemp(const ArgType& arg)
     : m_arg(arg)
   { }
- 
+
   const ArgType& arg() const
   {
     return m_arg;
   }
 
-  Index rows() const 
+  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT
   {
     return m_arg.rows();
   }
 
-  Index cols() const 
+  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT
   {
     return m_arg.cols();
   }
@@ -1655,7 +1708,7 @@ class EvalToTemp
  private:
   const ArgType& m_arg;
 };
- 
+
 template<typename ArgType>
 struct evaluator<EvalToTemp<ArgType> >
   : public evaluator<typename ArgType::PlainObject>
@@ -1663,7 +1716,7 @@ struct evaluator<EvalToTemp<ArgType> >
   typedef EvalToTemp<ArgType>                   XprType;
   typedef typename ArgType::PlainObject         PlainObject;
   typedef evaluator<PlainObject> Base;
-  
+
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
     : m_result(xpr.arg())
   {
diff --git a/contrib/eigen/Eigen/src/Core/CoreIterators.h b/contrib/eigen/Eigen/src/Core/CoreIterators.h
index 4eb42b93af18d06efd093dfd918297a00274ee31..b967196813b21194a0119ecf2eb721fa32ce07e8 100644
--- a/contrib/eigen/Eigen/src/Core/CoreIterators.h
+++ b/contrib/eigen/Eigen/src/Core/CoreIterators.h
@@ -48,6 +48,11 @@ public:
     * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
     */
   EIGEN_STRONG_INLINE InnerIterator& operator++()   { m_iter.operator++(); return *this; }
+  EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; }
+  EIGEN_STRONG_INLINE InnerIterator operator+(Index i) 
+  { InnerIterator result(*this); result+=i; return result; }
+    
+
   /// \returns the column or row index of the current coefficient.
   EIGEN_STRONG_INLINE Index index() const           { return m_iter.index(); }
   /// \returns the row index of the current coefficient.
diff --git a/contrib/eigen/Eigen/src/Core/CwiseBinaryOp.h b/contrib/eigen/Eigen/src/Core/CwiseBinaryOp.h
index a36765e396bd439ef85d096a0fd6d3a126a9fbfe..2202b1cc6b78c19de49e23ab81fd6e0982372996 100644
--- a/contrib/eigen/Eigen/src/Core/CwiseBinaryOp.h
+++ b/contrib/eigen/Eigen/src/Core/CwiseBinaryOp.h
@@ -74,7 +74,7 @@ class CwiseBinaryOpImpl;
   * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
   */
 template<typename BinaryOp, typename LhsType, typename RhsType>
-class CwiseBinaryOp : 
+class CwiseBinaryOp :
   public CwiseBinaryOpImpl<
           BinaryOp, LhsType, RhsType,
           typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
@@ -83,7 +83,7 @@ class CwiseBinaryOp :
   internal::no_assignment_operator
 {
   public:
-    
+
     typedef typename internal::remove_all<BinaryOp>::type Functor;
     typedef typename internal::remove_all<LhsType>::type Lhs;
     typedef typename internal::remove_all<RhsType>::type Rhs;
@@ -100,8 +100,14 @@ class CwiseBinaryOp :
     typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
     typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
+#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11
+    //Required for Visual Studio or the Copy constructor will probably not get inlined!
+    EIGEN_STRONG_INLINE
+    CwiseBinaryOp(const CwiseBinaryOp<BinaryOp,LhsType,RhsType>&) = default;
+#endif
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
       : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
     {
       EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
@@ -110,31 +116,25 @@ class CwiseBinaryOp :
       eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT {
       // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
-        return m_rhs.rows();
-      else
-        return m_lhs.rows();
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows();
     }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT {
       // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
-        return m_rhs.cols();
-      else
-        return m_lhs.cols();
+      return internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic ? m_rhs.cols() : m_lhs.cols();
     }
 
     /** \returns the left hand side nested expression */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const _LhsNested& lhs() const { return m_lhs; }
     /** \returns the right hand side nested expression */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const _RhsNested& rhs() const { return m_rhs; }
     /** \returns the functor representing the binary operation */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const BinaryOp& functor() const { return m_functor; }
 
   protected:
@@ -158,7 +158,7 @@ public:
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -181,4 +181,3 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 } // end namespace Eigen
 
 #endif // EIGEN_CWISE_BINARY_OP_H
-
diff --git a/contrib/eigen/Eigen/src/Core/CwiseNullaryOp.h b/contrib/eigen/Eigen/src/Core/CwiseNullaryOp.h
index ddd607e3832110602ff54acb97c67b94ecabbe18..289ec510a8ac733646e6f5915fd35fb8812ef559 100644
--- a/contrib/eigen/Eigen/src/Core/CwiseNullaryOp.h
+++ b/contrib/eigen/Eigen/src/Core/CwiseNullaryOp.h
@@ -74,10 +74,10 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
             && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const { return m_cols.value(); }
 
     /** \returns the functor representing the nullary operation */
     EIGEN_DEVICE_FUNC
@@ -105,7 +105,12 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp,typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp,PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -126,12 +131,17 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
   *
   * Here is an example with C++11 random generators: \include random_cpp11.cpp
   * Output: \verbinclude random_cpp11.out
-  * 
+  *
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -150,7 +160,12 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -170,7 +185,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
   return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@@ -217,27 +232,32 @@ DenseBase<Derived>::Constant(const Scalar& value)
 
 /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
   *
-  * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
+  * \only_for_vectors
+  *
+  * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp
+  * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out
+  *
+  * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
 }
 
 /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
   *
-  * \sa LinSpaced(Scalar,Scalar)
+  * \sa LinSpaced(const Scalar&, const Scalar&)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
 }
 
 /**
@@ -268,7 +288,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomA
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
 }
 
 /**
@@ -281,7 +301,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
 }
 
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
@@ -363,6 +383,33 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
   return setConstant(val);
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to the given value \a val. For the parameter
+  * of type NoChange_t, just pass the special value \c NoChange.
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(NoChange_t, Index cols, const Scalar& val)
+{
+  return setConstant(rows(), cols, val);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to the given value \a val. For the parameter
+  * of type NoChange_t, just pass the special value \c NoChange.
+  *
+  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setConstant(Index rows, NoChange_t, const Scalar& val)
+{
+  return setConstant(rows, cols(), val);
+}
+
+
 /**
   * \brief Sets a linearly spaced vector.
   *
@@ -383,7 +430,7 @@ template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar>(low,high,newSize));
 }
 
 /**
@@ -536,6 +583,32 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
   return setConstant(Scalar(0));
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to zero. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(NoChange_t, Index cols)
+{
+  return setZero(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to zero. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+  * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Zero()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setZero(Index rows, NoChange_t)
+{
+  return setZero(rows, cols());
+}
+
 // ones:
 
 /** \returns an expression of a matrix where all coefficients equal one.
@@ -662,6 +735,32 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
   return setConstant(Scalar(1));
 }
 
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to one. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(Index rows, NoChange_t)
+{
+  return setOnes(rows, cols());
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to one. For the parameter of type NoChange_t,
+  * just pass the special value \c NoChange.
+  *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t) class CwiseNullaryOp, MatrixBase::Ones()
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setOnes(NoChange_t, Index cols)
+{
+  return setOnes(rows(), cols);
+}
+
 // Identity:
 
 /** \returns an expression of the identity matrix (not necessarily square).
@@ -861,6 +960,42 @@ template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
 { return Derived::Unit(3); }
 
+/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
+  *
+  * \param i index of the unique coefficient to be set to 1
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i<size());
+  derived().setZero();
+  derived().coeffRef(i) = Scalar(1);
+  return derived();
+}
+
+/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
+  *
+  * \param newSize the new size of the vector
+  * \param i index of the unique coefficient to be set to 1
+  *
+  * \only_for_vectors
+  *
+  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+  */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i<newSize);
+  derived().resize(newSize);
+  return setUnit(i);
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_CWISE_NULLARY_OP_H
diff --git a/contrib/eigen/Eigen/src/Core/CwiseUnaryOp.h b/contrib/eigen/Eigen/src/Core/CwiseUnaryOp.h
index 1d2dd19f2b1158332c514e36111464e2e6f9c73b..e68c4f7480824e51d0ab360a393b520af6665199 100644
--- a/contrib/eigen/Eigen/src/Core/CwiseUnaryOp.h
+++ b/contrib/eigen/Eigen/src/Core/CwiseUnaryOp.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_CWISE_UNARY_OP_H
 #define EIGEN_CWISE_UNARY_OP_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename UnaryOp, typename XprType>
@@ -24,7 +24,7 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
   typedef typename XprType::Nested XprTypeNested;
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   enum {
-    Flags = _XprTypeNested::Flags & RowMajorBit 
+    Flags = _XprTypeNested::Flags & RowMajorBit
   };
 };
 }
@@ -65,10 +65,10 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal
     explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index rows() const { return m_xpr.rows(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index cols() const { return m_xpr.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
 
     /** \returns the functor representing the unary operation */
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/contrib/eigen/Eigen/src/Core/CwiseUnaryView.h b/contrib/eigen/Eigen/src/Core/CwiseUnaryView.h
index 5a30fa8df189c4170a21d7e3ec3b39778755037e..a06d7621ec143a3c3354bf4079ae7bd25c6f03b4 100644
--- a/contrib/eigen/Eigen/src/Core/CwiseUnaryView.h
+++ b/contrib/eigen/Eigen/src/Core/CwiseUnaryView.h
@@ -64,24 +64,26 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
     typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
     typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-    explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
+    explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
       : m_matrix(mat), m_functor(func) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
 
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     /** \returns the functor representing unary operation */
-    const ViewOp& functor() const { return m_functor; }
+    EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
 
     /** \returns the nested expression */
-    const typename internal::remove_all<MatrixTypeNested>::type&
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<MatrixTypeNested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
-    typename internal::remove_reference<MatrixTypeNested>::type&
-    nestedExpression() { return m_matrix.const_cast_derived(); }
+    EIGEN_DEVICE_FUNC typename internal::remove_reference<MatrixTypeNested>::type&
+    nestedExpression() { return m_matrix; }
 
   protected:
     MatrixTypeNested m_matrix;
@@ -108,16 +110,16 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
 
     EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
-    
+
     EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
     EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
 
-    EIGEN_DEVICE_FUNC inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
     {
       return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
 
-    EIGEN_DEVICE_FUNC inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
     {
       return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
diff --git a/contrib/eigen/Eigen/src/Core/DenseBase.h b/contrib/eigen/Eigen/src/Core/DenseBase.h
index c55a68230cd16a2b558739b6354dee310fdb96c5..9b16db68d48d2bfbf7e5c076ef2ba1504074f3f3 100644
--- a/contrib/eigen/Eigen/src/Core/DenseBase.h
+++ b/contrib/eigen/Eigen/src/Core/DenseBase.h
@@ -14,15 +14,15 @@
 namespace Eigen {
 
 namespace internal {
-  
+
 // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type.
 // This dummy function simply aims at checking that at compile time.
 static inline void check_DenseIndex_is_signed() {
-  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); 
+  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
 }
 
 } // end namespace internal
-  
+
 /** \class DenseBase
   * \ingroup Core_Module
   *
@@ -64,12 +64,12 @@ template<typename Derived> class DenseBase
 
     /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc. */
     typedef typename internal::traits<Derived>::Scalar Scalar;
-    
+
     /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
       *
       * It is an alias for the Scalar type */
     typedef Scalar value_type;
-    
+
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
 
@@ -150,13 +150,18 @@ template<typename Derived> class DenseBase
           * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
           */
 
-      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
-                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+      IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
+                           || internal::traits<Derived>::ColsAtCompileTime == 1,
         /**< This is set to true if either the number of rows or the number of
           * columns is known at compile-time to be equal to 1. Indeed, in that case,
           * we are dealing with a column-vector (if there is only one column) or with
           * a row-vector (if there is only one row). */
 
+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+         * and 2 for matrices.
+         */
+
       Flags = internal::traits<Derived>::Flags,
         /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
           * constructed from this one. See the \ref flags "list of flags".
@@ -170,11 +175,11 @@ template<typename Derived> class DenseBase
       InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
       OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
     };
-    
+
     typedef typename internal::find_best_packet<Scalar,SizeAtCompileTime>::type PacketScalar;
 
     enum { IsPlainObjectBase = 0 };
-    
+
     /** The plain matrix type corresponding to this expression.
       * \sa PlainObject */
     typedef Matrix<typename internal::traits<Derived>::Scalar,
@@ -184,7 +189,7 @@ template<typename Derived> class DenseBase
                 internal::traits<Derived>::MaxRowsAtCompileTime,
                 internal::traits<Derived>::MaxColsAtCompileTime
           > PlainMatrix;
-    
+
     /** The plain array type corresponding to this expression.
       * \sa PlainObject */
     typedef Array<typename internal::traits<Derived>::Scalar,
@@ -206,7 +211,7 @@ template<typename Derived> class DenseBase
 
     /** \returns the number of nonzero coefficients which is in practice the number
       * of stored coefficients. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index nonZeros() const { return size(); }
 
     /** \returns the outer size.
@@ -214,7 +219,7 @@ template<typename Derived> class DenseBase
       * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
       * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
       * column-major matrix, and the number of rows for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     Index outerSize() const
     {
       return IsVectorAtCompileTime ? 1
@@ -224,9 +229,9 @@ template<typename Derived> class DenseBase
     /** \returns the inner size.
       *
       * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
-      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
+      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
       * column-major matrix, and the number of columns for a row-major matrix. */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     Index innerSize() const
     {
       return IsVectorAtCompileTime ? this->size()
@@ -261,9 +266,9 @@ template<typename Derived> class DenseBase
     /** \internal Represents a matrix with all coefficients equal to one another*/
     typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
     /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> SequentialLinSpacedReturnType;
+    EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> SequentialLinSpacedReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> RandomAccessLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> RandomAccessLinSpacedReturnType;
     /** \internal the return type of MatrixBase::eigenvalues() */
     typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
 
@@ -297,17 +302,17 @@ template<typename Derived> class DenseBase
     Derived& operator=(const ReturnByValue<OtherDerived>& func);
 
     /** \internal
-      * Copies \a other into *this without evaluating other. \returns a reference to *this.
-      * \deprecated */
+      * Copies \a other into *this without evaluating other. \returns a reference to *this. */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
     Derived& lazyAssign(const DenseBase<OtherDerived>& other);
 
     EIGEN_DEVICE_FUNC
     CommaInitializer<Derived> operator<< (const Scalar& s);
 
-    /** \deprecated it now returns \c *this */
     template<unsigned int Added,unsigned int Removed>
+    /** \deprecated it now returns \c *this */
     EIGEN_DEPRECATED
     const Derived& flagged() const
     { return derived(); }
@@ -332,12 +337,13 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(const Scalar& value);
 
-    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
+
     EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(Index size, const Scalar& low, const Scalar& high);
-    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
-    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
     EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(const Scalar& low, const Scalar& high);
 
@@ -369,7 +375,7 @@ template<typename Derived> class DenseBase
     template<typename OtherDerived> EIGEN_DEVICE_FUNC
     bool isApprox(const DenseBase<OtherDerived>& other,
                   const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     bool isMuchSmallerThan(const RealScalar& other,
                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     template<typename OtherDerived> EIGEN_DEVICE_FUNC
@@ -380,7 +386,7 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    
+
     inline bool hasNaN() const;
     inline bool allFinite() const;
 
@@ -394,8 +400,8 @@ template<typename Derived> class DenseBase
       *
       * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
       * a const reference, in order to avoid a useless copy.
-      * 
-      * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
+      *
+      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
       */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE EvalReturnType eval() const
@@ -405,12 +411,12 @@ template<typename Derived> class DenseBase
       // size types on MSVC.
       return typename internal::eval<Derived>::type(derived());
     }
-    
+
     /** swaps *this with the expression \a other.
       *
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void swap(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
@@ -422,7 +428,7 @@ template<typename Derived> class DenseBase
       *
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void swap(PlainObjectBase<OtherDerived>& other)
     {
       eigen_assert(rows()==other.rows() && cols()==other.cols());
@@ -443,18 +449,58 @@ template<typename Derived> class DenseBase
 
     EIGEN_DEVICE_FUNC Scalar prod() const;
 
+    template<int NaNPropagation>
     EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
+    template<int NaNPropagation>
     EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
 
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+
+    // By default, the fastest version with undefined NaN propagation semantics is
+    // used.
+    // TODO(rmlarsen): Replace with default template argument when we move to
+    // c++11 or beyond.
+    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff() const {
+      return minCoeff<PropagateFast>();
+    }
+    EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff() const {
+      return maxCoeff<PropagateFast>();
+    }
+
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType> EIGEN_DEVICE_FUNC
+    template<int NaNPropagation, typename IndexType>
+    EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
 
+    // TODO(rmlarsen): Replace these methods with a default template argument.
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const {
+      return minCoeff<PropagateFast>(row, col);
+    }
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const {
+      return maxCoeff<PropagateFast>(row, col);
+    }
+    template<typename IndexType>
+     EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const {
+      return minCoeff<PropagateFast>(index);
+    }
+    template<typename IndexType>
+    EIGEN_DEVICE_FUNC inline
+    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const {
+      return maxCoeff<PropagateFast>(index);
+    }
+  
     template<typename BinaryOp>
     EIGEN_DEVICE_FUNC
     Scalar redux(const BinaryOp& func) const;
@@ -493,7 +539,7 @@ template<typename Derived> class DenseBase
     typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
     typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
 
-    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions
     *
     * Example: \include MatrixBase_rowwise.cpp
     * Output: \verbinclude MatrixBase_rowwise.out
@@ -506,7 +552,7 @@ template<typename Derived> class DenseBase
     }
     EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();
 
-    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions
     *
     * Example: \include MatrixBase_colwise.cpp
     * Output: \verbinclude MatrixBase_colwise.out
@@ -524,16 +570,16 @@ template<typename Derived> class DenseBase
     static const RandomReturnType Random();
 
     template<typename ThenDerived,typename ElseDerived>
-    const Select<Derived,ThenDerived,ElseDerived>
+    inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived,ElseDerived>
     select(const DenseBase<ThenDerived>& thenMatrix,
            const DenseBase<ElseDerived>& elseMatrix) const;
 
     template<typename ThenDerived>
-    inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+    inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
     select(const DenseBase<ThenDerived>& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const;
 
     template<typename ElseDerived>
-    inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+    inline EIGEN_DEVICE_FUNC const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
     select(const typename ElseDerived::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
 
     template<int p> RealScalar lpNorm() const;
@@ -567,16 +613,59 @@ template<typename Derived> class DenseBase
     }
     EIGEN_DEVICE_FUNC void reverseInPlace();
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+      * iterator type as returned by the begin() and end() methods.
+      */
+    typedef random_access_iterator_type iterator;
+    /** This is the const version of iterator (aka read-only) */
+    typedef random_access_iterator_type const_iterator;
+    #else
+    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
+                                            internal::pointer_based_stl_iterator<Derived>,
+                                            internal::generic_randaccess_stl_iterator<Derived>
+                                          >::type iterator_type;
+
+    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
+                                            internal::pointer_based_stl_iterator<const Derived>,
+                                            internal::generic_randaccess_stl_iterator<const Derived>
+                                          >::type const_iterator_type;
+
+    // Stl-style iterators are supported only for vectors.
+
+    typedef typename internal::conditional< IsVectorAtCompileTime,
+                                            iterator_type,
+                                            void
+                                          >::type iterator;
+
+    typedef typename internal::conditional< IsVectorAtCompileTime,
+                                            const_iterator_type,
+                                            void
+                                          >::type const_iterator;
+    #endif
+
+    inline iterator begin();
+    inline const_iterator begin() const;
+    inline const_iterator cbegin() const;
+    inline iterator end();
+    inline const_iterator end() const;
+    inline const_iterator cend() const;
+
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
 #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/BlockMethods.h"
+#   include "../plugins/IndexedViewMethods.h"
+#   include "../plugins/ReshapedMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 #undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
+#undef EIGEN_DOC_UNARY_ADDONS
 
     // disable the use of evalTo for dense objects with a nice compilation error
     template<typename Dest>
diff --git a/contrib/eigen/Eigen/src/Core/DenseCoeffsBase.h b/contrib/eigen/Eigen/src/Core/DenseCoeffsBase.h
index c4af48ab699f438e82ae488b8bb1382f3693c7b3..37fcdb59119ff83758ce004d7526b69a73dbb275 100644
--- a/contrib/eigen/Eigen/src/Core/DenseCoeffsBase.h
+++ b/contrib/eigen/Eigen/src/Core/DenseCoeffsBase.h
@@ -22,11 +22,12 @@ template<typename T> struct add_const_on_value_type_if_arithmetic
 /** \brief Base class providing read-only coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  * \tparam #ReadOnlyAccessors Constant indicating read-only access
+  *
+  * \note #ReadOnlyAccessors Constant indicating read-only access
   *
   * This class defines the \c operator() \c const function and friends, which can be used to read specific
   * entries of a matrix or array.
-  * 
+  *
   * \sa DenseCoeffsBase<Derived, WriteAccessors>, DenseCoeffsBase<Derived, DirectAccessors>,
   *     \ref TopicClassHierarchy
   */
@@ -288,12 +289,13 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
 /** \brief Base class providing read/write coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  * \tparam #WriteAccessors Constant indicating read/write access
+  *
+  * \note #WriteAccessors Constant indicating read/write access
   *
   * This class defines the non-const \c operator() function and friends, which can be used to write specific
   * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
   * defines the const variant for reading specific entries.
-  * 
+  *
   * \sa DenseCoeffsBase<Derived, DirectAccessors>, \ref TopicClassHierarchy
   */
 template<typename Derived>
@@ -466,7 +468,8 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
 /** \brief Base class providing direct read-only coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  * \tparam #DirectAccessors Constant indicating direct access
+  *
+  * \note #DirectAccessors Constant indicating direct access
   *
   * This class defines functions to work with strides which can be used to access entries directly. This class
   * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
@@ -492,7 +495,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa outerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index innerStride() const
     {
       return derived().innerStride();
@@ -503,14 +506,14 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
       return derived().outerStride();
     }
 
     // FIXME shall we remove it ?
-    inline Index stride() const
+    EIGEN_CONSTEXPR inline Index stride() const
     {
       return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
     }
@@ -519,7 +522,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), outerStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index rowStride() const
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -529,7 +532,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index colStride() const
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -539,7 +542,8 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
 /** \brief Base class providing direct read/write coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  * \tparam #DirectWriteAccessors Constant indicating direct access
+  *
+  * \note #DirectWriteAccessors Constant indicating direct access
   *
   * This class defines functions to work with strides which can be used to access entries directly. This class
   * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
@@ -566,8 +570,8 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa outerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT
     {
       return derived().innerStride();
     }
@@ -577,14 +581,14 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), rowStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT
     {
       return derived().outerStride();
     }
 
     // FIXME shall we remove it ?
-    inline Index stride() const
+    EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT
     {
       return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
     }
@@ -593,8 +597,8 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), outerStride(), colStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index rowStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rowStride() const EIGEN_NOEXCEPT
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
     }
@@ -603,8 +607,8 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
-    EIGEN_DEVICE_FUNC
-    inline Index colStride() const
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index colStride() const EIGEN_NOEXCEPT
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
     }
@@ -615,7 +619,7 @@ namespace internal {
 template<int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
-  static inline Index run(const Derived&)
+  static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT
   { return 0; }
 };
 
diff --git a/contrib/eigen/Eigen/src/Core/DenseStorage.h b/contrib/eigen/Eigen/src/Core/DenseStorage.h
index 7d6d4e66d4b2750c94ad23ad0e1b999a8f3f42ad..08ef6c530617401094344790e64a7ac21addb2ad 100644
--- a/contrib/eigen/Eigen/src/Core/DenseStorage.h
+++ b/contrib/eigen/Eigen/src/Core/DenseStorage.h
@@ -47,21 +47,21 @@ struct plain_array
 
   EIGEN_DEVICE_FUNC
   plain_array()
-  { 
+  {
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
   plain_array(constructor_without_unaligned_array_assert)
-  { 
+  {
     check_static_allocation_size<T,Size>();
   }
 };
 
 #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
-#elif EIGEN_GNUC_AT_LEAST(4,7) 
-  // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
+#elif EIGEN_GNUC_AT_LEAST(4,7)
+  // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
   // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
   // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
   template<typename PtrType>
@@ -85,15 +85,15 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 8>
   EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
+  plain_array()
   {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -104,15 +104,15 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 16>
   EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
-  { 
+  plain_array()
+  {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -123,15 +123,15 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 32>
   EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
+  plain_array()
   {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -142,15 +142,15 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 64>
   EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
 
   EIGEN_DEVICE_FUNC
-  plain_array() 
-  { 
+  plain_array()
+  {
     EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
     check_static_allocation_size<T,Size>();
   }
 
   EIGEN_DEVICE_FUNC
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
+  plain_array(constructor_without_unaligned_array_assert)
+  {
     check_static_allocation_size<T,Size>();
   }
 };
@@ -163,6 +163,30 @@ struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
   EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
 
+struct plain_array_helper {
+  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static void copy(const plain_array<T, Size, MatrixOrArrayOptions, Alignment>& src, const Eigen::Index size,
+                         plain_array<T, Size, MatrixOrArrayOptions, Alignment>& dst) {
+    smart_copy(src.array, src.array + size, dst.array);
+  }
+  
+  template<typename T, int Size, int MatrixOrArrayOptions, int Alignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static void swap(plain_array<T, Size, MatrixOrArrayOptions, Alignment>& a, const Eigen::Index a_size,
+                   plain_array<T, Size, MatrixOrArrayOptions, Alignment>& b, const Eigen::Index b_size) {
+    if (a_size < b_size) {
+      std::swap_ranges(b.array, b.array + a_size, a.array);
+      smart_move(b.array + a_size, b.array + b_size, a.array + a_size);
+    } else if (a_size > b_size) {
+      std::swap_ranges(a.array, a.array + b_size, b.array);
+      smart_move(a.array + b_size, a.array + a_size, b.array + b_size);
+    } else {
+      std::swap_ranges(a.array, a.array + a_size, b.array);
+    }
+  }
+};
+
 } // end namespace internal
 
 /** \internal
@@ -190,16 +214,41 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
     EIGEN_DEVICE_FUNC
     explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()) {}
-    EIGEN_DEVICE_FUNC 
+#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
+    EIGEN_DEVICE_FUNC
     DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
     }
-    EIGEN_DEVICE_FUNC 
+#else
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default;
+#endif
+#if !EIGEN_HAS_CXX11
+    EIGEN_DEVICE_FUNC
     DenseStorage& operator=(const DenseStorage& other)
-    { 
+    {
       if (this != &other) m_data = other.m_data;
-      return *this; 
+      return *this;
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default;
+#endif
+#if EIGEN_HAS_RVALUE_REFERENCES
+#if !EIGEN_HAS_CXX11
+    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
+      : m_data(std::move(other.m_data))
+    {
     }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
+    {
+      if (this != &other)
+        m_data = std::move(other.m_data);
+      return *this;
+    }
+#else
+    EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default;
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default;
+#endif
+#endif
     EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
       EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
       eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
@@ -207,9 +256,11 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
       EIGEN_UNUSED_VARIABLE(rows);
       EIGEN_UNUSED_VARIABLE(cols);
     }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data, other.m_data);
+    }
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
@@ -226,8 +277,8 @@ template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
     EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
     EIGEN_DEVICE_FUNC const T *data() const { return 0; }
@@ -254,20 +305,28 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
     EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
     EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
-    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
-    { 
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_rows = other.m_rows;
         m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, m_rows * m_cols, m_data);
       }
-      return *this; 
+      return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
+      internal::plain_array_helper::swap(m_data, m_rows * m_cols, other.m_data, other.m_rows * other.m_cols);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
     EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@@ -285,20 +344,29 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
     EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
     EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
-    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(other.m_rows)
+    {
+      internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
+    }
+    
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_rows = other.m_rows;
+        internal::plain_array_helper::copy(other.m_data, m_rows * _Cols, m_data);
       }
-      return *this; 
+      return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    { 
+      internal::plain_array_helper::swap(m_data, m_rows * _Cols, other.m_data, other.m_rows * _Cols);
+      numext::swap(m_rows, other.m_rows);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols(void) const EIGEN_NOEXCEPT {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
     EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
@@ -314,22 +382,29 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
     EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
     EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) 
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(other.m_cols)
+    {
+      internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
+    }
     EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
     {
       if (this != &other)
       {
-        m_data = other.m_data;
         m_cols = other.m_cols;
+        internal::plain_array_helper::copy(other.m_data, _Rows * m_cols, m_data);
       }
       return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
-    void resize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      internal::plain_array_helper::swap(m_data, _Rows * m_cols, other.m_data, _Rows * other.m_cols);
+      numext::swap(m_cols, other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows(void) const EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
     EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -381,18 +456,21 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
+      numext::swap(m_cols, other.m_cols);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
     void conservativeResize(Index size, Index rows, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
@@ -446,7 +524,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
         this->swap(tmp);
       }
       return *this;
-    }    
+    }
 #if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
     DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
@@ -459,16 +537,18 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_cols, other.m_cols);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_cols, other.m_cols);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
-    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_cols,other.m_cols);
+    }
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
@@ -520,7 +600,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
         this->swap(tmp);
       }
       return *this;
-    }    
+    }
 #if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
     DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT
@@ -533,16 +613,18 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
-    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+      numext::swap(m_data,other.m_data);
+      numext::swap(m_rows,other.m_rows);
+    }
+    EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;}
+    EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) {return _Cols;}
     void conservativeResize(Index size, Index rows, Index)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
diff --git a/contrib/eigen/Eigen/src/Core/Diagonal.h b/contrib/eigen/Eigen/src/Core/Diagonal.h
index afcaf3575677ced89e094715fe97c3b8b9864c5d..3112d2c16a9b715d3f96f1bc658ccc12f8d62dd7 100644
--- a/contrib/eigen/Eigen/src/Core/Diagonal.h
+++ b/contrib/eigen/Eigen/src/Core/Diagonal.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_DIAGONAL_H
 #define EIGEN_DIAGONAL_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Diagonal
   * \ingroup Core_Module
@@ -84,20 +84,16 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
                                : numext::mini<Index>(m_matrix.rows(),m_matrix.cols()-m_index.value());
     }
 
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return 1; }
 
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
-    {
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT {
       return m_matrix.outerStride() + 1;
     }
 
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
-    {
-      return 0;
-    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return 0; }
 
     typedef typename internal::conditional<
                        internal::is_lvalue<MatrixType>::value,
@@ -149,8 +145,8 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
     }
 
     EIGEN_DEVICE_FUNC
-    inline const typename internal::remove_all<typename MatrixType::Nested>::type& 
-    nestedExpression() const 
+    inline const typename internal::remove_all<typename MatrixType::Nested>::type&
+    nestedExpression() const
     {
       return m_matrix;
     }
@@ -167,12 +163,12 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
 
   private:
     // some compilers may fail to optimize std::max etc in case of compile-time constants...
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index absDiagIndex() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rowOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index colOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : 0; }
     // trigger a compile-time error if someone try to call packet
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
@@ -187,7 +183,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
   *
   * \sa class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
   return DiagonalReturnType(derived());
@@ -195,7 +191,7 @@ MatrixBase<Derived>::diagonal()
 
 /** This is the const version of diagonal(). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
   return ConstDiagonalReturnType(derived());
@@ -213,7 +209,7 @@ MatrixBase<Derived>::diagonal() const
   *
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
   return DiagonalDynamicIndexReturnType(derived(), index);
@@ -221,7 +217,7 @@ MatrixBase<Derived>::diagonal(Index index)
 
 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
   return ConstDiagonalDynamicIndexReturnType(derived(), index);
@@ -240,6 +236,7 @@ MatrixBase<Derived>::diagonal(Index index) const
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
 template<int Index_>
+EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
@@ -249,6 +246,7 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
+EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
diff --git a/contrib/eigen/Eigen/src/Core/DiagonalMatrix.h b/contrib/eigen/Eigen/src/Core/DiagonalMatrix.h
index ecfdce8efa1c9be1a2d6bcc7f36d3ab16fc2622a..542685c65948acef3858ae1857ac87f443440179 100644
--- a/contrib/eigen/Eigen/src/Core/DiagonalMatrix.h
+++ b/contrib/eigen/Eigen/src/Core/DiagonalMatrix.h
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
 
     EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const { return derived(); }
-    
+
     EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
     EIGEN_DEVICE_FUNC
@@ -83,6 +83,30 @@ class DiagonalBase : public EigenBase<Derived>
     {
       return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
     }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    inline unspecified_expression_type
+    #else
+    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,sum) >
+    #endif
+    operator+(const DiagonalBase<OtherDerived>& other) const
+    {
+      return (diagonal() + other.diagonal()).asDiagonal();
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    inline unspecified_expression_type
+    #else
+    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,difference) >
+    #endif
+    operator-(const DiagonalBase<OtherDerived>& other) const
+    {
+      return (diagonal() - other.diagonal()).asDiagonal();
+    }
 };
 
 #endif
@@ -154,6 +178,30 @@ class DiagonalMatrix
     EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
 
+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11
+      * 
+      * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients.
+      * 
+      * \warning To construct a diagonal matrix of fixed size, the number of values passed to this 
+      * constructor must match the fixed dimension of \c *this.
+      * 
+      * \sa DiagonalMatrix(const Scalar&, const Scalar&)
+      * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
+      */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args)
+      : m_diagonal(a0, a1, a2, args...) {}
+
+    /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_diagonal(list) {}
+    #endif  // EIGEN_HAS_CXX11
+
     /** Copy constructor. */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
@@ -273,7 +321,7 @@ class DiagonalWrapper
   * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
   **/
 template<typename Derived>
-inline const DiagonalWrapper<const Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
   return DiagonalWrapper<const Derived>(derived());
diff --git a/contrib/eigen/Eigen/src/Core/DiagonalProduct.h b/contrib/eigen/Eigen/src/Core/DiagonalProduct.h
index d372b938f656c042c0413c88aa5eaf4e5bb977d6..7911d1cd174a3b59195b81169ed7ec9546342464 100644
--- a/contrib/eigen/Eigen/src/Core/DiagonalProduct.h
+++ b/contrib/eigen/Eigen/src/Core/DiagonalProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
   */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const Product<Derived, DiagonalDerived, LazyProduct>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
   return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
diff --git a/contrib/eigen/Eigen/src/Core/Dot.h b/contrib/eigen/Eigen/src/Core/Dot.h
index 1fe7a84a48dc94c24d287a36607587486c2293cf..5c3441b9266a4e1929a29f373e5c509f77c5e0ef 100644
--- a/contrib/eigen/Eigen/src/Core/Dot.h
+++ b/contrib/eigen/Eigen/src/Core/Dot.h
@@ -86,14 +86,14 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 
 //---------- implementation of L2 norm and related functions ----------
 
-/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
+/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
   * In both cases, it consists in the sum of the square of all the matrix entries.
   * For vectors, this is also equals to the dot product of \c *this with itself.
   *
   * \sa dot(), norm(), lpNorm()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
 {
   return numext::real((*this).cwiseAbs2().sum());
 }
@@ -105,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
   * \sa lpNorm(), dot(), squaredNorm()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
   return numext::sqrt(squaredNorm());
 }
@@ -120,7 +120,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
   * \sa norm(), normalize()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
   typedef typename internal::nested_eval<Derived,2>::type _Nested;
@@ -142,7 +142,7 @@ MatrixBase<Derived>::normalized() const
   * \sa norm(), normalized()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
 {
   RealScalar z = squaredNorm();
   // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -163,7 +163,7 @@ EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
   * \sa stableNorm(), stableNormalize(), normalized()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::stableNormalized() const
 {
   typedef typename internal::nested_eval<Derived,3>::type _Nested;
@@ -188,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
   * \sa stableNorm(), stableNormalized(), normalize()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
 {
   RealScalar w = cwiseAbs().maxCoeff();
   RealScalar z = (derived()/w).squaredNorm();
@@ -207,7 +207,7 @@ struct lpNorm_selector
   EIGEN_DEVICE_FUNC
   static inline RealScalar run(const MatrixBase<Derived>& m)
   {
-    EIGEN_USING_STD_MATH(pow)
+    EIGEN_USING_STD(pow)
     return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
   }
 };
@@ -260,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
 template<typename Derived>
 template<int p>
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 #else
-MatrixBase<Derived>::RealScalar
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
 #endif
 MatrixBase<Derived>::lpNorm() const
 {
diff --git a/contrib/eigen/Eigen/src/Core/EigenBase.h b/contrib/eigen/Eigen/src/Core/EigenBase.h
index b195506a91e8c6c60fe3da6151878a3100caf390..6b3c7d3745e2ae96233789208c8dd2d586a3baa3 100644
--- a/contrib/eigen/Eigen/src/Core/EigenBase.h
+++ b/contrib/eigen/Eigen/src/Core/EigenBase.h
@@ -15,7 +15,7 @@ namespace Eigen {
 
 /** \class EigenBase
   * \ingroup Core_Module
-  * 
+  *
   * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
   *
   * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
@@ -29,11 +29,12 @@ namespace Eigen {
 template<typename Derived> struct EigenBase
 {
 //   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
-  
+
   /** \brief The interface type of indices
     * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
-    * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
     * \sa StorageIndex, \ref TopicPreprocessorDirectives.
+    * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
+    * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute.
     */
   typedef Eigen::Index Index;
 
@@ -55,15 +56,15 @@ template<typename Derived> struct EigenBase
   { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-  EIGEN_DEVICE_FUNC
-  inline Index rows() const { return derived().rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
   /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-  EIGEN_DEVICE_FUNC
-  inline Index cols() const { return derived().cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
   /** \returns the number of coefficients, which is rows()*cols().
     * \sa rows(), cols(), SizeAtCompileTime. */
-  EIGEN_DEVICE_FUNC
-  inline Index size() const { return rows() * cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); }
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
   template<typename Dest>
diff --git a/contrib/eigen/Eigen/src/Core/ForceAlignedAccess.h b/contrib/eigen/Eigen/src/Core/ForceAlignedAccess.h
index 7b08b45e67c8bd72a7b6d574855d86ff67253c71..817a43afced3a74477485099d821dc2bcf4f4e79 100644
--- a/contrib/eigen/Eigen/src/Core/ForceAlignedAccess.h
+++ b/contrib/eigen/Eigen/src/Core/ForceAlignedAccess.h
@@ -41,10 +41,14 @@ template<typename ExpressionType> class ForceAlignedAccess
 
     EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); }
 
     EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
diff --git a/contrib/eigen/Eigen/src/Core/Fuzzy.h b/contrib/eigen/Eigen/src/Core/Fuzzy.h
index 3e403a09d92fc025661b427e5649c3334c7309c8..43aa49b2bc2f4636b08ecac59d3d2bf3217815b5 100644
--- a/contrib/eigen/Eigen/src/Core/Fuzzy.h
+++ b/contrib/eigen/Eigen/src/Core/Fuzzy.h
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
   */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isApprox(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
   const DenseBase<OtherDerived>& other,
   const RealScalar& prec
 ) const
@@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
   * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
   */
 template<typename Derived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
   const typename NumTraits<Scalar>::Real& other,
   const RealScalar& prec
 ) const
@@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
   */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
   const DenseBase<OtherDerived>& other,
   const RealScalar& prec
 ) const
diff --git a/contrib/eigen/Eigen/src/Core/GeneralProduct.h b/contrib/eigen/Eigen/src/Core/GeneralProduct.h
index 6f0cc80e940c67db21f7dd56058bae51e5d7b1b9..6906aa75d106c4b2ccb12e0848d4d42b5ecc9e9f 100644
--- a/contrib/eigen/Eigen/src/Core/GeneralProduct.h
+++ b/contrib/eigen/Eigen/src/Core/GeneralProduct.h
@@ -18,6 +18,16 @@ enum {
   Small = 3
 };
 
+// Define the threshold value to fallback from the generic matrix-matrix product
+// implementation (heavy) to the lightweight coeff-based product one.
+// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+// in products/GeneralMatrixMatrix.h for more details.
+// TODO This threshold should also be used in the compile-time selector below.
+#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
+// This default value has been obtained on a Haswell architecture.
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
+#endif
+
 namespace internal {
 
 template<int Rows, int Cols, int Depth> struct product_type_selector;
@@ -25,7 +35,7 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
 template<int Size, int MaxSize> struct product_size_category
 {
   enum {
-    #ifndef EIGEN_CUDA_ARCH
+    #ifndef EIGEN_GPU_COMPILE_PHASE
     is_large = MaxSize == Dynamic ||
                Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
                (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
@@ -153,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE  Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };
 
 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
 };
 
 template<typename Scalar,int Size,int MaxSize>
@@ -218,8 +228,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
     ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
     ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
 
     // make sure Dest is a compile-time vector type (bug 1166)
     typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest;
@@ -229,7 +238,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
       // on, the other hand it is good for the cache to pack the vector anyways...
       EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
       ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal
+      MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0)
     };
 
     typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
@@ -310,13 +319,12 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
     typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
-                                  * RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0
     };
 
     gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
@@ -386,7 +394,8 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
   */
 template<typename Derived>
 template<typename OtherDerived>
-inline const Product<Derived, OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
   // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -428,6 +437,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
   */
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 const Product<Derived,OtherDerived,LazyProduct>
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
diff --git a/contrib/eigen/Eigen/src/Core/GenericPacketMath.h b/contrib/eigen/Eigen/src/Core/GenericPacketMath.h
index e594437791beec0ee137697e8652fbd175e6d12a..cf677a1905f1e80697eb6c0daccb48f745c605e3 100644
--- a/contrib/eigen/Eigen/src/Core/GenericPacketMath.h
+++ b/contrib/eigen/Eigen/src/Core/GenericPacketMath.h
@@ -44,23 +44,29 @@ struct default_packet_traits
   enum {
     HasHalfPacket = 0,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasNegate = 1,
-    HasAbs    = 1,
-    HasArg    = 0,
-    HasAbs2   = 1,
-    HasMin    = 1,
-    HasMax    = 1,
-    HasConj   = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 0,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
     HasSetLinear = 1,
-    HasBlend  = 0,
+    HasBlend     = 0,
+    // This flag is used to indicate whether packet comparison is supported.
+    // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
+    HasCmp       = 0,
 
     HasDiv    = 0,
     HasSqrt   = 0,
     HasRsqrt  = 0,
     HasExp    = 0,
+    HasExpm1  = 0,
     HasLog    = 0,
     HasLog1p  = 0,
     HasLog10  = 0,
@@ -81,14 +87,18 @@ struct default_packet_traits
     HasPolygamma = 0,
     HasErf = 0,
     HasErfc = 0,
+    HasNdtri = 0,
+    HasBessel = 0,
     HasIGamma = 0,
+    HasIGammaDerA = 0,
+    HasGammaSampleDerAlpha = 0,
     HasIGammac = 0,
     HasBetaInc = 0,
 
     HasRound  = 0,
+    HasRint   = 0,
     HasFloor  = 0,
     HasCeil   = 0,
-
     HasSign   = 0
   };
 };
@@ -119,6 +129,22 @@ template<typename T> struct packet_traits : default_packet_traits
 
 template<typename T> struct packet_traits<const T> : packet_traits<T> { };
 
+template<typename T> struct unpacket_traits
+{
+  typedef T type;
+  typedef T half;
+  enum
+  {
+    size = 1,
+    alignment = 1,
+    vectorizable = false,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<typename T> struct unpacket_traits<const T> : unpacket_traits<T> { };
+
 template <typename Src, typename Tgt> struct type_casting_traits {
   enum {
     VectorizedCast = 0,
@@ -127,6 +153,34 @@ template <typename Src, typename Tgt> struct type_casting_traits {
   };
 };
 
+/** \internal Wrapper to ensure that multiple packet types can map to the same
+    same underlying vector type. */
+template<typename T, int unique_id = 0>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
+};
+
+
+/** \internal A convenience utility for determining if the type is a scalar.
+ * This is used to enable some generic packet implementations.
+ */
+template<typename Packet>
+struct is_scalar {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  enum {
+    value = internal::is_same<Packet, Scalar>::value
+  };
+};
 
 /** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
 template <typename SrcPacket, typename TgtPacket>
@@ -139,75 +193,406 @@ EIGEN_DEVICE_FUNC inline TgtPacket
 pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
   return static_cast<TgtPacket>(a);
 }
-
 template <typename SrcPacket, typename TgtPacket>
 EIGEN_DEVICE_FUNC inline TgtPacket
 pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) {
   return static_cast<TgtPacket>(a);
 }
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/,
+      const SrcPacket& /*e*/, const SrcPacket& /*f*/, const SrcPacket& /*g*/, const SrcPacket& /*h*/) {
+  return static_cast<TgtPacket>(a);
+}
+
+/** \internal \returns reinterpret_cast<Target>(a) */
+template <typename Target, typename Packet>
+EIGEN_DEVICE_FUNC inline Target
+preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } */
 
 /** \internal \returns a + b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-padd(const Packet& a,
-        const Packet& b) { return a+b; }
+padd(const Packet& a, const Packet& b) { return a+b; }
+// Avoid compiler warning for boolean algebra.
+template<> EIGEN_DEVICE_FUNC inline bool
+padd(const bool& a, const bool& b) { return a || b; }
 
 /** \internal \returns a - b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-psub(const Packet& a,
-        const Packet& b) { return a-b; }
+psub(const Packet& a, const Packet& b) { return a-b; }
 
 /** \internal \returns -a (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pnegate(const Packet& a) { return -a; }
 
-/** \internal \returns conj(a) (coeff-wise) */
+template<> EIGEN_DEVICE_FUNC inline bool
+pnegate(const bool& a) { return !a; }
 
+/** \internal \returns conj(a) (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pconj(const Packet& a) { return numext::conj(a); }
 
 /** \internal \returns a * b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmul(const Packet& a,
-        const Packet& b) { return a*b; }
+pmul(const Packet& a, const Packet& b) { return a*b; }
+// Avoid compiler warning for boolean algebra.
+template<> EIGEN_DEVICE_FUNC inline bool
+pmul(const bool& a, const bool& b) { return a && b; }
 
 /** \internal \returns a / b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pdiv(const Packet& a,
-        const Packet& b) { return a/b; }
+pdiv(const Packet& a, const Packet& b) { return a/b; }
+
+// In the generic case, memset to all one bits.
+template<typename Packet, typename EnableIf = void>
+struct ptrue_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){
+    Packet b;
+    memset(static_cast<void*>(&b), 0xff, sizeof(Packet));
+    return b;
+  }
+};
 
-/** \internal \returns the min of \a a and \a b  (coeff-wise) */
+// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value).
+// Although this is technically not a valid bitmask, the scalar path for pselect
+// uses a comparison to zero, so this should still work in most cases. We don't
+// have another option, since the scalar type requires initialization.
+template<typename T>
+struct ptrue_impl<T, 
+    typename internal::enable_if<is_scalar<T>::value && NumTraits<T>::RequireInitialization>::type > {
+  static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){
+    return T(1);
+  }
+};
+
+/** \internal \returns one bits. */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmin(const Packet& a,
-        const Packet& b) { return numext::mini(a, b); }
+ptrue(const Packet& a) {
+  return ptrue_impl<Packet>::run(a);
+}
+
+// In the general case, memset to zero.
+template<typename Packet, typename EnableIf = void>
+struct pzero_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
+    Packet b;
+    memset(static_cast<void*>(&b), 0x00, sizeof(Packet));
+    return b;
+  }
+};
+
+// For scalars, explicitly set to Scalar(0), since the underlying representation
+// for zero may not consist of all-zero bits.
+template<typename T>
+struct pzero_impl<T,
+    typename internal::enable_if<is_scalar<T>::value>::type> {
+  static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) {
+    return T(0);
+  }
+};
 
-/** \internal \returns the max of \a a and \a b  (coeff-wise) */
+/** \internal \returns packet of zeros */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmax(const Packet& a,
-        const Packet& b) { return numext::maxi(a, b); }
+pzero(const Packet& a) {
+  return pzero_impl<Packet>::run(a);
+}
 
-/** \internal \returns the absolute value of \a a */
+/** \internal \returns a <= b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pabs(const Packet& a) { using std::abs; return abs(a); }
+pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? ptrue(a) : pzero(a); }
 
-/** \internal \returns the phase angle of \a a */
+/** \internal \returns a < b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-parg(const Packet& a) { using numext::arg; return arg(a); }
+pcmp_lt(const Packet& a, const Packet& b)  { return a<b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a == b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? pzero(a) : ptrue(a); }
+
+template<typename T>
+struct bit_and {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a & b;
+  }
+};
+
+template<typename T>
+struct bit_or {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a | b;
+  }
+};
+
+template<typename T>
+struct bit_xor {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const {
+    return a ^ b;
+  }
+};
+
+template<typename T>
+struct bit_not {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const {
+    return ~a;
+  }
+};
+
+// Use operators &, |, ^, ~.
+template<typename T>
+struct operator_bitwise_helper {
+  EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not<T>()(a); }
+};
+
+// Apply binary operations byte-by-byte
+template<typename T>
+struct bytewise_bitwise_helper {
+  EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) {
+    return binary(a, b, bit_and<unsigned char>());
+  }
+  EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { 
+    return binary(a, b, bit_or<unsigned char>());
+   }
+  EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) {
+    return binary(a, b, bit_xor<unsigned char>());
+  }
+  EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { 
+    return unary(a,bit_not<unsigned char>());
+   }
+  
+ private:
+  template<typename Op>
+  EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) {
+    const unsigned char* a_ptr = reinterpret_cast<const unsigned char*>(&a);
+    T c;
+    unsigned char* c_ptr = reinterpret_cast<unsigned char*>(&c);
+    for (size_t i = 0; i < sizeof(T); ++i) {
+      *c_ptr++ = op(*a_ptr++);
+    }
+    return c;
+  }
+
+  template<typename Op>
+  EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) {
+    const unsigned char* a_ptr = reinterpret_cast<const unsigned char*>(&a);
+    const unsigned char* b_ptr = reinterpret_cast<const unsigned char*>(&b);
+    T c;
+    unsigned char* c_ptr = reinterpret_cast<unsigned char*>(&c);
+    for (size_t i = 0; i < sizeof(T); ++i) {
+      *c_ptr++ = op(*a_ptr++, *b_ptr++);
+    }
+    return c;
+  }
+};
+
+// In the general case, use byte-by-byte manipulation.
+template<typename T, typename EnableIf = void>
+struct bitwise_helper : public bytewise_bitwise_helper<T> {};
+
+// For integers or non-trivial scalars, use binary operators.
+template<typename T>
+struct bitwise_helper<T,
+  typename internal::enable_if<
+    is_scalar<T>::value && (NumTraits<T>::IsInteger || NumTraits<T>::RequireInitialization)>::type
+  > : public operator_bitwise_helper<T> {};
 
 /** \internal \returns the bitwise and of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pand(const Packet& a, const Packet& b) { return a & b; }
+pand(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_and(a, b);
+}
 
 /** \internal \returns the bitwise or of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-por(const Packet& a, const Packet& b) { return a | b; }
+por(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_or(a, b);
+}
 
 /** \internal \returns the bitwise xor of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pxor(const Packet& a, const Packet& b) { return a ^ b; }
+pxor(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_xor(a, b);
+}
+
+/** \internal \returns the bitwise not of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) {
+  return bitwise_helper<Packet>::bitwise_not(a);
+}
+
+/** \internal \returns the bitwise and of \a a and not \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); }
+
+// In the general case, use bitwise select.
+template<typename Packet, typename EnableIf = void>
+struct pselect_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
+    return por(pand(a,mask),pandnot(b,mask));
+  }
+};
+
+// For scalars, use ternary select.
+template<typename Packet>
+struct pselect_impl<Packet, 
+    typename internal::enable_if<is_scalar<Packet>::value>::type > {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
+    return numext::equal_strict(mask, Packet(0)) ? b : a;
+  }
+};
+
+/** \internal \returns \a or \b for each field in packet according to \mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pselect(const Packet& mask, const Packet& a, const Packet& b) {
+  return pselect_impl<Packet>::run(mask, a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC inline bool pselect<bool>(
+    const bool& cond, const bool& a, const bool& b) {
+  return cond ? a : b;
+}
+
+/** \internal \returns the min or of \a a and \a b (coeff-wise)
+    If either \a a or \a b are NaN, the result is implementation defined. */
+template<int NaNPropagation>
+struct pminmax_impl {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+    return op(a,b);
+  }
+};
+
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
+    If either \a a or \a b are NaN, NaN is returned. */
+template<>
+struct pminmax_impl<PropagateNaN> {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet not_nan_mask_b = pcmp_eq(b, b);
+  return pselect(not_nan_mask_a,
+                 pselect(not_nan_mask_b, op(a, b), b),
+                 a);
+  }
+};
+
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
+    If both \a a and \a b are NaN, NaN is returned.
+    Equivalent to std::fmin(a, b).  */
+template<>
+struct pminmax_impl<PropagateNumbers> {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet not_nan_mask_b = pcmp_eq(b, b);
+  return pselect(not_nan_mask_a,
+                 pselect(not_nan_mask_b, op(a, b), a),
+                 b);
+  }
+};
+
+
+#ifndef SYCL_DEVICE_ONLY
+#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func
+#else
+#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) \
+[](const Type& a, const Type& b) { \
+        return Func(a, b);}
+#endif
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise).
+    If \a a or \b b is NaN, the return value is implementation defined. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmin(const Packet& a, const Packet& b) { return numext::mini(a,b); }
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise).
+    NaNPropagation determines the NaN propagation semantics. */
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
+  return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
+}
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise)
+    If \a a or \b b is NaN, the return value is implementation defined. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); }
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise).
+    NaNPropagation determines the NaN propagation semantics. */
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
+  return pminmax_impl<NaNPropagation>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet,(pmax<Packet>)));
+}
+
+/** \internal \returns the absolute value of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pabs(const Packet& a) { return numext::abs(a); }
+template<> EIGEN_DEVICE_FUNC inline unsigned int
+pabs(const unsigned int& a) { return a; }
+template<> EIGEN_DEVICE_FUNC inline unsigned long
+pabs(const unsigned long& a) { return a; }
+template<> EIGEN_DEVICE_FUNC inline unsigned long long
+pabs(const unsigned long long& a) { return a; }
+
+/** \internal \returns the addsub value of \a a,b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+paddsub(const Packet& a, const Packet& b) {
+  return pselect(peven_mask(a), padd(a, b), psub(a, b));
+ }
+
+/** \internal \returns the phase angle of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+parg(const Packet& a) { using numext::arg; return arg(a); }
+
+
+/** \internal \returns \a a logically shifted by N bits to the right */
+template<int N> EIGEN_DEVICE_FUNC inline int
+parithmetic_shift_right(const int& a) { return a >> N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+parithmetic_shift_right(const long int& a) { return a >> N; }
+
+/** \internal \returns \a a arithmetically shifted by N bits to the right */
+template<int N> EIGEN_DEVICE_FUNC inline int
+plogical_shift_right(const int& a) { return static_cast<int>(static_cast<unsigned int>(a) >> N); }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+plogical_shift_right(const long int& a) { return static_cast<long>(static_cast<unsigned long>(a) >> N); }
 
-/** \internal \returns the bitwise andnot of \a a and \a b */
+/** \internal \returns \a a shifted by N bits to the left */
+template<int N> EIGEN_DEVICE_FUNC inline int
+plogical_shift_left(const int& a) { return a << N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+plogical_shift_left(const long int& a) { return a << N; }
+
+/** \internal \returns the significant and exponent of the underlying floating point numbers
+  * See https://en.cppreference.com/w/cpp/numeric/math/frexp
+  */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) {
+  int exp;
+  EIGEN_USING_STD(frexp);
+  Packet result = static_cast<Packet>(frexp(a, &exp));
+  exponent = static_cast<Packet>(exp);
+  return result;
+}
+
+/** \internal \returns a * 2^((int)exponent)
+  * See https://en.cppreference.com/w/cpp/numeric/math/ldexp
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pldexp(const Packet &a, const Packet &exponent) {
+  EIGEN_USING_STD(ldexp)
+  return static_cast<Packet>(ldexp(a, static_cast<int>(exponent)));
+}
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
 
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -217,10 +602,22 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns a packet version of \a *from, (un-aligned masked load)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_load_available, Packet>::type
+ploadu(const typename unpacket_traits<Packet>::type* from, typename unpacket_traits<Packet>::mask_t umask);
+
 /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
 
+/** \internal \returns a packet with constant coefficients set from bits */
+template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
+pset1frombits(BitsType a);
+
 /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
@@ -237,7 +634,7 @@ ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
   * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
   * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
   * Currently, this function is only used in matrix products.
-  * For packet-size smaller or equal to 4, this function is equivalent to pload1 
+  * For packet-size smaller or equal to 4, this function is equivalent to pload1
   */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadquad(const typename unpacket_traits<Packet>::type* from)
@@ -281,6 +678,20 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 plset(const typename unpacket_traits<Packet>::type& a) { return a; }
 
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0),
+     where x is the value of all 1-bits. */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+peven_mask(const Packet& /*a*/) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  const size_t n = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+  for(size_t i = 0; i < n; ++i) {
+    memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
+  }
+  return ploadu<Packet>(elements);
+}
+
+
 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
 { (*to) = from; }
@@ -289,6 +700,15 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 {  (*to) = from; }
 
+/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template<typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline
+typename enable_if<unpacket_traits<Packet>::masked_store_available, void>::type
+pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
+
  template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
  { return ploadu<Packet>(from); }
 
@@ -298,8 +718,10 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
-#ifdef __CUDA_ARCH__
-#if defined(__LP64__)
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  // do nothing
+#elif defined(EIGEN_CUDA_ARCH)
+#if defined(__LP64__) || EIGEN_OS_WIN64
   // 64-bit pointer operand constraint for inlined asm
   asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
 #else
@@ -311,39 +733,6 @@ template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a
 #endif
 }
 
-/** \internal \returns the first element of a packet */
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
-{ return a; }
-
-/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-preduxp(const Packet* vecs) { return vecs[0]; }
-
-/** \internal \returns the sum of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
-{ return a; }
-
-/** \internal \returns the sum of the elements of \a a by block of 4 elements.
-  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
-  * For packet-size smaller or equal to 4, this boils down to a noop.
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline
-typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
-predux_downto4(const Packet& a)
-{ return a; }
-
-/** \internal \returns the product of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
-{ return a; }
-
-/** \internal \returns the min of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
-{ return a; }
-
-/** \internal \returns the max of the elements of \a a*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
-{ return a; }
-
 /** \internal \returns the reversed elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
@@ -351,7 +740,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
-  return Packet(a.imag(),a.real());
+  return Packet(numext::imag(a),numext::real(a));
 }
 
 /**************************
@@ -360,47 +749,51 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet
 
 /** \internal \returns the sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin(const Packet& a) { using std::sin; return sin(a); }
+Packet psin(const Packet& a) { EIGEN_USING_STD(sin); return sin(a); }
 
 /** \internal \returns the cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos(const Packet& a) { using std::cos; return cos(a); }
+Packet pcos(const Packet& a) { EIGEN_USING_STD(cos); return cos(a); }
 
 /** \internal \returns the tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ptan(const Packet& a) { using std::tan; return tan(a); }
+Packet ptan(const Packet& a) { EIGEN_USING_STD(tan); return tan(a); }
 
 /** \internal \returns the arc sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin(const Packet& a) { using std::asin; return asin(a); }
+Packet pasin(const Packet& a) { EIGEN_USING_STD(asin); return asin(a); }
 
 /** \internal \returns the arc cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos(const Packet& a) { using std::acos; return acos(a); }
+Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); }
 
 /** \internal \returns the arc tangent of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan(const Packet& a) { using std::atan; return atan(a); }
+Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); }
 
 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
+Packet psinh(const Packet& a) { EIGEN_USING_STD(sinh); return sinh(a); }
 
 /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
+Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); }
 
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
+Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); }
 
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp(const Packet& a) { using std::exp; return exp(a); }
+Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); }
+
+/** \internal \returns the expm1 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pexpm1(const Packet& a) { return numext::expm1(a); }
 
 /** \internal \returns the log of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog(const Packet& a) { using std::log; return log(a); }
+Packet plog(const Packet& a) { EIGEN_USING_STD(log); return log(a); }
 
 /** \internal \returns the log1p of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@@ -408,16 +801,24 @@ Packet plog1p(const Packet& a) { return numext::log1p(a); }
 
 /** \internal \returns the log10 of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog10(const Packet& a) { using std::log10; return log10(a); }
+Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); }
+
+/** \internal \returns the log10 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog2(const Packet& a) {
+  typedef typename internal::unpacket_traits<Packet>::type Scalar;
+  return pmul(pset1<Packet>(Scalar(EIGEN_LOG2E)), plog(a)); 
+}
 
 /** \internal \returns the square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
+Packet psqrt(const Packet& a) { return numext::sqrt(a); }
 
 /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet prsqrt(const Packet& a) {
-  return pdiv(pset1<Packet>(1), psqrt(a));
+  typedef typename internal::unpacket_traits<Packet>::type Scalar;
+  return pdiv(pset1<Packet>(Scalar(1)), psqrt(a));
 }
 
 /** \internal \returns the rounded value of \a a (coeff-wise) */
@@ -428,15 +829,121 @@ Packet pround(const Packet& a) { using numext::round; return round(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
 
+/** \internal \returns the rounded value of \a a (coeff-wise) with current
+ * rounding mode */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet print(const Packet& a) { using numext::rint; return rint(a); }
+
 /** \internal \returns the ceil of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
 
+/** \internal \returns the first element of a packet */
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+pfirst(const Packet& a)
+{ return a; }
+
+/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller or equal to 4, this boils down to a noop.
+  */
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux_half_dowto4(const Packet& a)
+{ return a; }
+
+// Slow generic implementation of Packet reduction.
+template <typename Packet, typename Op>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+predux_helper(const Packet& a, Op op) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  const size_t n = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+  pstoreu<Scalar>(elements, a);
+  for(size_t k = n / 2; k > 0; k /= 2)  {
+    for(size_t i = 0; i < k; ++i) {
+      elements[i] = op(elements[i], elements[i + k]);
+    }
+  }
+  return elements[0];
+}
+
+/** \internal \returns the sum of the elements of \a a*/
+template<typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type
+predux(const Packet& a)
+{
+  return a;
+}
+
+/** \internal \returns the product of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul<Scalar>)));
+}
+
+/** \internal \returns the min of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(
+    const Packet &a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<PropagateFast, Scalar>)));
+}
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
+}
+
+/** \internal \returns the min of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(
+    const Packet &a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<PropagateFast, Scalar>)));
+}
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(
+    const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar; 
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
+}
+
+#undef EIGEN_BINARY_OP_NAN_PROPAGATION
+
+/** \internal \returns true if all coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or first bit equals to 1 (1 for ints, smallest denormal for floats).
+  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  return numext::not_equal_strict(predux(a), Scalar(0));
+}
+
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
 
-/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
+/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
 // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
 template<typename Packet>
 inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
@@ -484,41 +991,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_t
   return ploadt<Packet, LoadMode>(from);
 }
 
-/** \internal default implementation of palign() allowing partial specialization */
-template<int Offset,typename PacketType>
-struct palign_impl
-{
-  // by default data are aligned, so there is nothing to be done :)
-  static inline void run(PacketType&, const PacketType&) {}
-};
-
-/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
-  * of \a first and \a Offset first elements of \a second.
-  * 
-  * This function is currently only used to optimize matrix-vector products on unligned matrices.
-  * It takes 2 packets that represent a contiguous memory array, and returns a packet starting
-  * at the position \a Offset. For instance, for packets of 4 elements, we have:
-  *  Input:
-  *  - first = {f0,f1,f2,f3}
-  *  - second = {s0,s1,s2,s3}
-  * Output: 
-  *   - if Offset==0 then {f0,f1,f2,f3}
-  *   - if Offset==1 then {f1,f2,f3,s0}
-  *   - if Offset==2 then {f2,f3,s0,s1}
-  *   - if Offset==3 then {f3,s0,s1,s3}
-  */
-template<int Offset,typename PacketType>
-inline void palign(PacketType& first, const PacketType& second)
-{
-  palign_impl<Offset,PacketType>::run(first,second);
-}
-
 /***************************************************************************
 * Fast complex products (GCC generates a function call which is very slow)
 ***************************************************************************/
 
 // Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
+#if !defined(EIGEN_GPUCC)
 
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 { return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
@@ -555,34 +1033,6 @@ pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& th
   return ifPacket.select[0] ? thenPacket : elsePacket;
 }
 
-/** \internal \returns \a a with the first coefficient replaced by the scalar b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b)
-{
-  // Default implementation based on pblend.
-  // It must be specialized for higher performance.
-  Selector<unpacket_traits<Packet>::size> mask;
-  mask.select[0] = true;
-  // This for loop should be optimized away by the compiler.
-  for(Index i=1; i<unpacket_traits<Packet>::size; ++i)
-    mask.select[i] = false;
-  return pblend(mask, pset1<Packet>(b), a);
-}
-
-/** \internal \returns \a a with the last coefficient replaced by the scalar b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
-{
-  // Default implementation based on pblend.
-  // It must be specialized for higher performance.
-  Selector<unpacket_traits<Packet>::size> mask;
-  // This for loop should be optimized away by the compiler.
-  for(Index i=0; i<unpacket_traits<Packet>::size-1; ++i)
-    mask.select[i] = false;
-  mask.select[unpacket_traits<Packet>::size-1] = true;
-  return pblend(mask, pset1<Packet>(b), a);
-}
-
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/GlobalFunctions.h b/contrib/eigen/Eigen/src/Core/GlobalFunctions.h
index 769dc255c207c3e8ca3245571d5035167fc0ef0c..629af94b9940f3b474b399dd6646cb2b1f1ad29a 100644
--- a/contrib/eigen/Eigen/src/Core/GlobalFunctions.h
+++ b/contrib/eigen/Eigen/src/Core/GlobalFunctions.h
@@ -66,21 +66,31 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+#if EIGEN_HAS_CXX11_MATH
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
+#endif
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2,scalar_log2_op,base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
@@ -88,7 +98,7 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
-  
+
   /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
     *
     * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
@@ -102,17 +112,18 @@ namespace Eigen
   inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
   pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
 #else
-  template<typename Derived,typename ScalarExponent>
-  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
-          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
-  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
-    return x.derived().pow(exponent);
-  }
-
-  template<typename Derived>
-  inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
-  pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
-    return x.derived().pow(exponent);
+  template <typename Derived,typename ScalarExponent>
+  EIGEN_DEVICE_FUNC inline
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+    const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
+                                                 EIGEN_COMMA ScalarExponent EIGEN_COMMA
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
+  {
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
+           typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
   }
 #endif
 
@@ -122,21 +133,21 @@ namespace Eigen
     *
     * Example: \include Cwise_array_power_array.cpp
     * Output: \verbinclude Cwise_array_power_array.out
-    * 
+    *
     * \sa ArrayBase::pow()
     *
     * \relates ArrayBase
     */
   template<typename Derived,typename ExponentDerived>
   inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
   {
     return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
       x.derived(),
       exponents.derived()
     );
   }
-  
+
   /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
     *
     * This function computes the coefficient-wise power between a scalar and an array of exponents.
@@ -145,7 +156,7 @@ namespace Eigen
     *
     * Example: \include Cwise_scalar_power_array.cpp
     * Output: \verbinclude Cwise_scalar_power_array.out
-    * 
+    *
     * \sa ArrayBase::pow()
     *
     * \relates ArrayBase
@@ -155,21 +166,17 @@ namespace Eigen
   inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
   pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
 #else
-  template<typename Scalar, typename Derived>
-  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
-          const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
-  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
-  {
-    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
-            typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
-  }
-
-  template<typename Derived>
-  inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
-  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
-  {
-    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
-      typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
+  template <typename Scalar, typename Derived>
+  EIGEN_DEVICE_FUNC inline
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+    const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
+                                                 EIGEN_COMMA Scalar EIGEN_COMMA
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
+  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
+           typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
   }
 #endif
 
diff --git a/contrib/eigen/Eigen/src/Core/IO.h b/contrib/eigen/Eigen/src/Core/IO.h
index da7fd6cce2e811bd8fe94e8a4a85b814377cbc6c..e81c3152168ba3a3e806ca262a0d2046d729f9d4 100644
--- a/contrib/eigen/Eigen/src/Core/IO.h
+++ b/contrib/eigen/Eigen/src/Core/IO.h
@@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   *  - \b rowSuffix string printed at the end of each row
   *  - \b matPrefix string printed at the beginning of the matrix
   *  - \b matSuffix string printed at the end of the matrix
+  *  - \b fill character printed to fill the empty space in aligned columns
   *
   * Example: \include IOFormat.cpp
   * Output: \verbinclude IOFormat.out
@@ -53,9 +54,9 @@ struct IOFormat
   IOFormat(int _precision = StreamPrecision, int _flags = 0,
     const std::string& _coeffSeparator = " ",
     const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
-    const std::string& _matPrefix="", const std::string& _matSuffix="")
+    const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
   : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
-    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
+    rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
   {
     // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
     // don't add rowSpacer if columns are not to be aligned
@@ -71,6 +72,7 @@ struct IOFormat
   std::string matPrefix, matSuffix;
   std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
   std::string coeffSeparator;
+  char fill;
   int precision;
   int flags;
 };
@@ -128,6 +130,9 @@ struct significant_decimals_impl
 template<typename Derived>
 std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt)
 {
+  using internal::is_same;
+  using internal::conditional;
+
   if(_m.size() == 0)
   {
     s << fmt.matPrefix << fmt.matSuffix;
@@ -136,6 +141,22 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   
   typename Derived::Nested m = _m;
   typedef typename Derived::Scalar Scalar;
+  typedef typename
+      conditional<
+          is_same<Scalar, char>::value ||
+            is_same<Scalar, unsigned char>::value ||
+            is_same<Scalar, numext::int8_t>::value ||
+            is_same<Scalar, numext::uint8_t>::value,
+          int,
+          typename conditional<
+              is_same<Scalar, std::complex<char> >::value ||
+                is_same<Scalar, std::complex<unsigned char> >::value ||
+                is_same<Scalar, std::complex<numext::int8_t> >::value ||
+                is_same<Scalar, std::complex<numext::uint8_t> >::value,
+              std::complex<int>,
+              const Scalar&
+            >::type
+        >::type PrintType;
 
   Index width = 0;
 
@@ -172,23 +193,31 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
       {
         std::stringstream sstr;
         sstr.copyfmt(s);
-        sstr << m.coeff(i,j);
+        sstr << static_cast<PrintType>(m.coeff(i,j));
         width = std::max<Index>(width, Index(sstr.str().length()));
       }
   }
+  std::streamsize old_width = s.width();
+  char old_fill_character = s.fill();
   s << fmt.matPrefix;
   for(Index i = 0; i < m.rows(); ++i)
   {
     if (i)
       s << fmt.rowSpacer;
     s << fmt.rowPrefix;
-    if(width) s.width(width);
-    s << m.coeff(i, 0);
+    if(width) {
+      s.fill(fmt.fill);
+      s.width(width);
+    }
+    s << static_cast<PrintType>(m.coeff(i, 0));
     for(Index j = 1; j < m.cols(); ++j)
     {
       s << fmt.coeffSeparator;
-      if (width) s.width(width);
-      s << m.coeff(i, j);
+      if(width) {
+        s.fill(fmt.fill);
+        s.width(width);
+      }
+      s << static_cast<PrintType>(m.coeff(i, j));
     }
     s << fmt.rowSuffix;
     if( i < m.rows() - 1)
@@ -196,6 +225,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   }
   s << fmt.matSuffix;
   if(explicit_precision) s.precision(old_precision);
+  if(width) {
+    s.fill(old_fill_character);
+    s.width(old_width);
+  }
   return s;
 }
 
diff --git a/contrib/eigen/Eigen/src/Core/Inverse.h b/contrib/eigen/Eigen/src/Core/Inverse.h
index b76f0439d80f1ee66c3a14e1e6e582b9f2327b8c..c514438c45e64863e1eb93071c4374d0d29ef969 100644
--- a/contrib/eigen/Eigen/src/Core/Inverse.h
+++ b/contrib/eigen/Eigen/src/Core/Inverse.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,7 +10,7 @@
 #ifndef EIGEN_INVERSE_H
 #define EIGEN_INVERSE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 template<typename XprType,typename StorageKind> class InverseImpl;
 
@@ -44,19 +44,18 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::S
 {
 public:
   typedef typename XprType::StorageIndex StorageIndex;
-  typedef typename XprType::PlainObject                       PlainObject;
   typedef typename XprType::Scalar                            Scalar;
   typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
   typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
   typedef typename internal::ref_selector<Inverse>::type Nested;
   typedef typename internal::remove_all<XprType>::type NestedExpression;
-  
+
   explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr)
     : m_xpr(xpr)
   {}
 
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR  Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR  Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
 
   EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
 
@@ -82,7 +81,7 @@ namespace internal {
 
 /** \internal
   * \brief Default evaluator for Inverse expression.
-  * 
+  *
   * This default evaluator for Inverse expression simply evaluate the inverse into a temporary
   * by a call to internal::call_assignment_no_alias.
   * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
@@ -97,7 +96,7 @@ struct unary_evaluator<Inverse<ArgType> >
   typedef Inverse<ArgType> InverseType;
   typedef typename InverseType::PlainObject PlainObject;
   typedef evaluator<PlainObject> Base;
-  
+
   enum { Flags = Base::Flags | EvalBeforeNestingBit };
 
   unary_evaluator(const InverseType& inv_xpr)
@@ -106,11 +105,11 @@ struct unary_evaluator<Inverse<ArgType> >
     ::new (static_cast<Base*>(this)) Base(m_result);
     internal::call_assignment_no_alias(m_result, inv_xpr);
   }
-  
+
 protected:
   PlainObject m_result;
 };
-  
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/Map.h b/contrib/eigen/Eigen/src/Core/Map.h
index 548bf9a2d558dbad6e83ce2c39618b08f225d766..218cc157f386924cae44ca6741d6aaba9fc96f74 100644
--- a/contrib/eigen/Eigen/src/Core/Map.h
+++ b/contrib/eigen/Eigen/src/Core/Map.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_MAP_H
 #define EIGEN_MAP_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename PlainObjectType, int MapOptions, typename StrideType>
@@ -47,7 +47,7 @@ private:
   * \brief A matrix or vector expression mapping an existing array of data.
   *
   * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
   *                The default is \c #Unaligned.
   * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
   *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
@@ -104,19 +104,19 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     EIGEN_DEVICE_FUNC
     inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index innerStride() const
     {
       return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
     }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
-      return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer()
-           : int(internal::traits<Map>::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
+           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
            : IsVectorAtCompileTime ? (this->size() * innerStride())
-           : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride())
+           : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
            : (this->rows() * innerStride());
     }
 
diff --git a/contrib/eigen/Eigen/src/Core/MapBase.h b/contrib/eigen/Eigen/src/Core/MapBase.h
index 92c3b2818a420cd3e279208fc3967059fcbbaa2a..d856447f03e57a749ec250456c0e00e539d7e700 100644
--- a/contrib/eigen/Eigen/src/Core/MapBase.h
+++ b/contrib/eigen/Eigen/src/Core/MapBase.h
@@ -15,7 +15,7 @@
       EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
                           YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Core_Module
   *
@@ -87,9 +87,11 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
     /** \copydoc DenseBase::rows() */
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); }
     /** \copydoc DenseBase::cols() */
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); }
 
     /** Returns a pointer to the first coefficient of the matrix or vector.
       *
diff --git a/contrib/eigen/Eigen/src/Core/MathFunctions.h b/contrib/eigen/Eigen/src/Core/MathFunctions.h
index 01736c2a060473567ef1d7316c5cc05a5756c0d9..61b78f4f20a6a3820b168796db9aeb79d712bb11 100644
--- a/contrib/eigen/Eigen/src/Core/MathFunctions.h
+++ b/contrib/eigen/Eigen/src/Core/MathFunctions.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,10 +11,11 @@
 #ifndef EIGEN_MATHFUNCTIONS_H
 #define EIGEN_MATHFUNCTIONS_H
 
-// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
 // TODO this should better be moved to NumTraits
-#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
-
+// Source: WolframAlpha
+#define EIGEN_PI    3.141592653589793238462643383279502884197169399375105820974944592307816406L
+#define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L
+#define EIGEN_LN2   0.693147180559945309417232121458176568075500134360255254120680009493393621L
 
 namespace Eigen {
 
@@ -97,7 +99,7 @@ struct real_default_impl<Scalar,true>
 
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct real_impl<std::complex<T> >
 {
@@ -145,7 +147,7 @@ struct imag_default_impl<Scalar,true>
 
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct imag_impl<std::complex<T> >
 {
@@ -213,12 +215,12 @@ struct imag_ref_default_impl
 template<typename Scalar>
 struct imag_ref_default_impl<Scalar, false>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Scalar run(Scalar&)
   {
     return Scalar(0);
   }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline const Scalar run(const Scalar&)
   {
     return Scalar(0);
@@ -239,7 +241,7 @@ struct imag_ref_retval
 ****************************************************************************/
 
 template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct conj_impl
+struct conj_default_impl
 {
   EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
@@ -249,7 +251,7 @@ struct conj_impl
 };
 
 template<typename Scalar>
-struct conj_impl<Scalar,true>
+struct conj_default_impl<Scalar,true>
 {
   EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
@@ -259,6 +261,9 @@ struct conj_impl<Scalar,true>
   }
 };
 
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_impl : conj_default_impl<Scalar, IsComplex> {};
+
 template<typename Scalar>
 struct conj_retval
 {
@@ -308,6 +313,65 @@ struct abs2_retval
   typedef typename NumTraits<Scalar>::Real type;
 };
 
+/****************************************************************************
+* Implementation of sqrt/rsqrt                                             *
+****************************************************************************/
+
+template<typename Scalar>
+struct sqrt_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(sqrt);
+    return sqrt(x);
+  }
+};
+
+// Complex sqrt defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& a_x);
+
+// Custom implementation is faster than `std::sqrt`, works on
+// GPU, and correctly handles special cases (unlike MSVC).
+template<typename T>
+struct sqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_sqrt<T>(x);
+  }
+};
+
+template<typename Scalar>
+struct sqrt_retval
+{
+  typedef Scalar type;
+};
+
+// Default implementation relies on numext::sqrt, at bottom of file.
+template<typename T>
+struct rsqrt_impl;
+
+// Complex rsqrt defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& a_x);
+
+template<typename T>
+struct rsqrt_impl<std::complex<T> >
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x)
+  {
+    return complex_rsqrt<T>(x);
+  }
+};
+
+template<typename Scalar>
+struct rsqrt_retval
+{
+  typedef Scalar type;
+};
+
 /****************************************************************************
 * Implementation of norm1                                                *
 ****************************************************************************/
@@ -322,7 +386,7 @@ struct norm1_default_impl<Scalar,true>
   EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
-    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD(abs);
     return abs(x.real()) + abs(x.imag());
   }
 };
@@ -333,7 +397,7 @@ struct norm1_default_impl<Scalar, false>
   EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
   {
-    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD(abs);
     return abs(x);
   }
 };
@@ -363,7 +427,7 @@ struct hypot_retval
 * Implementation of cast                                                 *
 ****************************************************************************/
 
-template<typename OldType, typename NewType>
+template<typename OldType, typename NewType, typename EnableIf = void>
 struct cast_impl
 {
   EIGEN_DEVICE_FUNC
@@ -373,6 +437,22 @@ struct cast_impl
   }
 };
 
+// Casting from S -> Complex<T> leads to an implicit conversion from S to T,
+// generating warnings on clang.  Here we explicitly cast the real component.
+template<typename OldType, typename NewType>
+struct cast_impl<OldType, NewType,
+  typename internal::enable_if<
+    !NumTraits<OldType>::IsComplex && NumTraits<NewType>::IsComplex
+  >::type>
+{
+  EIGEN_DEVICE_FUNC
+  static inline NewType run(const OldType& x)
+  {
+    typedef typename NumTraits<NewType>::Real NewReal;
+    return static_cast<NewType>(static_cast<NewReal>(x));
+  }
+};
+
 // here, for once, we're plainly returning NewType: we don't want cast to do weird things.
 
 template<typename OldType, typename NewType>
@@ -386,29 +466,59 @@ inline NewType cast(const OldType& x)
 * Implementation of round                                                   *
 ****************************************************************************/
 
+template<typename Scalar>
+struct round_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
 #if EIGEN_HAS_CXX11_MATH
-  template<typename Scalar>
-  struct round_impl {
-    static inline Scalar run(const Scalar& x)
-    {
-      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
-      using std::round;
-      return round(x);
-    }
-  };
+    EIGEN_USING_STD(round);
+#endif
+    return Scalar(round(x));
+  }
+};
+
+#if !EIGEN_HAS_CXX11_MATH
+#if EIGEN_HAS_C99_MATH
+// Use ::roundf for float.
+template<>
+struct round_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static inline float run(const float& x)
+  {
+    return ::roundf(x);
+  }
+};
 #else
-  template<typename Scalar>
-  struct round_impl
+template<typename Scalar>
+struct round_using_floor_ceil_impl
+{
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
   {
-    static inline Scalar run(const Scalar& x)
-    {
-      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
-      EIGEN_USING_STD_MATH(floor);
-      EIGEN_USING_STD_MATH(ceil);
-      return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5));
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+    // Without C99 round/roundf, resort to floor/ceil.
+    EIGEN_USING_STD(floor);
+    EIGEN_USING_STD(ceil);
+    // If not enough precision to resolve a decimal at all, return the input.
+    // Otherwise, adding 0.5 can trigger an increment by 1.
+    const Scalar limit = Scalar(1ull << (NumTraits<Scalar>::digits() - 1));
+    if (x >= limit || x <= -limit) {
+      return x;
     }
-  };
-#endif
+    return (x > Scalar(0)) ? Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5)));
+  }
+};
+
+template<>
+struct round_impl<float> : round_using_floor_ceil_impl<float> {};
+
+template<>
+struct round_impl<double> : round_using_floor_ceil_impl<double> {};
+#endif // EIGEN_HAS_C99_MATH
+#endif // !EIGEN_HAS_CXX11_MATH
 
 template<typename Scalar>
 struct round_retval
@@ -417,43 +527,112 @@ struct round_retval
 };
 
 /****************************************************************************
-* Implementation of arg                                                     *
+* Implementation of rint                                                    *
 ****************************************************************************/
 
+template<typename Scalar>
+struct rint_impl {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
 #if EIGEN_HAS_CXX11_MATH
-  template<typename Scalar>
-  struct arg_impl {
-    static inline Scalar run(const Scalar& x)
-    {
-      EIGEN_USING_STD_MATH(arg);
-      return arg(x);
-    }
-  };
-#else
-  template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-  struct arg_default_impl
+      EIGEN_USING_STD(rint);
+#endif
+    return rint(x);
+  }
+};
+
+#if !EIGEN_HAS_CXX11_MATH
+template<>
+struct rint_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static inline double run(const double& x)
   {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_DEVICE_FUNC
-    static inline RealScalar run(const Scalar& x)
-    {
-      return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); }
-  };
+    return ::rint(x);
+  }
+};
+template<>
+struct rint_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static inline float run(const float& x)
+  {
+    return ::rintf(x);
+  }
+};
+#endif
 
-  template<typename Scalar>
-  struct arg_default_impl<Scalar,true>
+template<typename Scalar>
+struct rint_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of arg                                                     *
+****************************************************************************/
+
+// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs.
+// This seems to be fixed in VS 2019.
+#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920)
+// std::arg is only defined for types of std::complex, or integer types or float/double/long double
+template<typename Scalar,
+          bool HasStdImpl = NumTraits<Scalar>::IsComplex || is_integral<Scalar>::value
+                            || is_same<Scalar, float>::value || is_same<Scalar, double>::value
+                            || is_same<Scalar, long double>::value >
+struct arg_default_impl;
+
+template<typename Scalar>
+struct arg_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
   {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_DEVICE_FUNC
-    static inline RealScalar run(const Scalar& x)
-    {
-      EIGEN_USING_STD_MATH(arg);
-      return arg(x);
-    }
-  };
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    // HIP does not seem to have a native device side implementation for the math routine "arg"
+    using std::arg;
+    #else
+    EIGEN_USING_STD(arg);
+    #endif
+    return static_cast<RealScalar>(arg(x));
+  }
+};
+
+// Must be non-complex floating-point type (e.g. half/bfloat16).
+template<typename Scalar>
+struct arg_default_impl<Scalar, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
+  }
+};
+#else
+template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct arg_default_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
+  }
+};
 
-  template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
+template<typename Scalar>
+struct arg_default_impl<Scalar,true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(arg);
+    return arg(x);
+  }
+};
 #endif
+template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
 
 template<typename Scalar>
 struct arg_retval
@@ -461,6 +640,80 @@ struct arg_retval
   typedef typename NumTraits<Scalar>::Real type;
 };
 
+/****************************************************************************
+* Implementation of expm1                                                   *
+****************************************************************************/
+
+// This implementation is based on GSL Math's expm1.
+namespace std_fallback {
+  // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
+  // or that there is no suitable std::expm1 function available. Implementation
+  // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
+  template<typename Scalar>
+  EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    EIGEN_USING_STD(exp);
+    Scalar u = exp(x);
+    if (numext::equal_strict(u, Scalar(1))) {
+      return x;
+    }
+    Scalar um1 = u - RealScalar(1);
+    if (numext::equal_strict(um1, Scalar(-1))) {
+      return RealScalar(-1);
+    }
+
+    EIGEN_USING_STD(log);
+    Scalar logu = log(u);
+    return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
+  }
+}
+
+template<typename Scalar>
+struct expm1_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    #if EIGEN_HAS_CXX11_MATH
+    using std::expm1;
+    #else
+    using std_fallback::expm1;
+    #endif
+    return expm1(x);
+  }
+};
+
+template<typename Scalar>
+struct expm1_retval
+{
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of log                                                     *
+****************************************************************************/
+
+// Complex log defined in MathFunctionsImpl.h.
+template<typename T> EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z);
+
+template<typename Scalar>
+struct log_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(log);
+    return static_cast<Scalar>(log(x));
+  }
+};
+
+template<typename Scalar>
+struct log_impl<std::complex<Scalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<Scalar> run(const std::complex<Scalar>& z)
+  {
+    return complex_log(z);
+  }
+};
+
 /****************************************************************************
 * Implementation of log1p                                                   *
 ****************************************************************************/
@@ -472,25 +725,38 @@ namespace std_fallback {
   EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    EIGEN_USING_STD_MATH(log);
+    EIGEN_USING_STD(log);
     Scalar x1p = RealScalar(1) + x;
-    return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
+    Scalar log_1p = log_impl<Scalar>::run(x1p);
+    const bool is_small = numext::equal_strict(x1p, Scalar(1));
+    const bool is_inf = numext::equal_strict(x1p, log_1p);
+    return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
   }
 }
 
 template<typename Scalar>
 struct log1p_impl {
-  static inline Scalar run(const Scalar& x)
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
     #if EIGEN_HAS_CXX11_MATH
     using std::log1p;
-    #endif
+    #else
     using std_fallback::log1p;
+    #endif
     return log1p(x);
   }
 };
 
+// Specialization for complex types that are not supported by std::log1p.
+template <typename RealScalar>
+struct log1p_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    return std_fallback::log1p(x);
+  }
+};
 
 template<typename Scalar>
 struct log1p_retval
@@ -509,7 +775,7 @@ struct pow_impl
   typedef typename ScalarBinaryOpTraits<ScalarX,ScalarY,internal::scalar_pow_op<ScalarX,ScalarY> >::ReturnType result_type;
   static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y)
   {
-    EIGEN_USING_STD_MATH(pow);
+    EIGEN_USING_STD(pow);
     return pow(x, y);
   }
 };
@@ -687,7 +953,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
   return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
 }
 
-// Implementatin of is* functions
+// Implementation of is* functions
 
 // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
 #if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
@@ -716,7 +982,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isfinite_impl(const T& x)
 {
-  #ifdef __CUDA_ARCH__
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
     return (::isfinite)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isfinite;
@@ -731,7 +997,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isinf_impl(const T& x)
 {
-  #ifdef __CUDA_ARCH__
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
     return (::isinf)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isinf;
@@ -746,7 +1012,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isnan_impl(const T& x)
 {
-  #ifdef __CUDA_ARCH__
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
     return (::isnan)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isnan;
@@ -803,7 +1069,6 @@ template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x)
 template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
 
 template<typename T> T generic_fast_tanh_float(const T& a_x);
-
 } // end namespace internal
 
 /****************************************************************************
@@ -812,12 +1077,12 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
 
 namespace numext {
 
-#ifndef __CUDA_ARCH__
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
-  EIGEN_USING_STD_MATH(min);
+  EIGEN_USING_STD(min)
   return min EIGEN_NOT_A_MACRO (x,y);
 }
 
@@ -825,7 +1090,7 @@ template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
-  EIGEN_USING_STD_MATH(max);
+  EIGEN_USING_STD(max)
   return max EIGEN_NOT_A_MACRO (x,y);
 }
 #else
@@ -841,6 +1106,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return fminf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
+  return fmin(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
+{
+#if defined(EIGEN_HIPCC)
+  // no "fminl" on HIP yet
+  return (x < y) ? x : y;
+#else
+  return fminl(x, y);
+#endif
+}
+
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -853,6 +1136,92 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
   return fmaxf(x, y);
 }
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
+  return fmax(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
+{
+#if defined(EIGEN_HIPCC)
+  // no "fmaxl" on HIP yet
+  return (x > y) ? x : y;
+#else
+  return fmaxl(x, y);
+#endif
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+
+
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
+
+#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+template<>                                               \
+  EIGEN_DEVICE_FUNC                                      \
+  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
+    return cl::sycl::FUNC(x);                            \
+  }
+
+#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
+  template<>                                                                  \
+  EIGEN_DEVICE_FUNC                                                           \
+  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
+    return cl::sycl::FUNC(x, y);                                              \
+  }
+
+#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+  SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
+
+#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
+  SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
+
 #endif
 
 
@@ -922,6 +1291,34 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
 EIGEN_DEVICE_FUNC
 inline bool abs2(bool x) { return x; }
 
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y)
+{
+  return x > y ? x - y : y - x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y)
+{
+  return fabsf(x - y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y)
+{
+  return fabs(x - y);
+}
+
+#if !defined(EIGEN_GPUCC)
+// HIP and CUDA do not support long double.
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
+  return fabsl(x - y);
+}
+#endif
+
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@@ -936,6 +1333,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
   return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
 }
 
+#if defined(SYCL_DEVICE_ONLY)
+  SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
+#endif
+
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
@@ -943,7 +1344,11 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
   return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log1p(const float &x) { return ::log1pf(x); }
 
@@ -958,10 +1363,27 @@ inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const Scala
   return internal::pow_impl<ScalarX,ScalarY>::run(x, y);
 }
 
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
+#endif
+
 template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
 template<typename T> EIGEN_DEVICE_FUNC bool (isinf)   (const T &x) { return internal::isinf_impl(x); }
 template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
 
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
+#endif
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x);
+}
+
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
@@ -969,15 +1391,23 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
   return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
 }
 
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
+#endif
+
 template<typename T>
 EIGEN_DEVICE_FUNC
 T (floor)(const T& x)
 {
-  EIGEN_USING_STD_MATH(floor);
+  EIGEN_USING_STD(floor)
   return floor(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float floor(const float &x) { return ::floorf(x); }
 
@@ -989,11 +1419,15 @@ template<typename T>
 EIGEN_DEVICE_FUNC
 T (ceil)(const T& x)
 {
-  EIGEN_USING_STD_MATH(ceil);
+  EIGEN_USING_STD(ceil);
   return ceil(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float ceil(const float &x) { return ::ceilf(x); }
 
@@ -1026,22 +1460,42 @@ inline int log2(int x)
   *
   * It's usage is justified in performance critical functions, like norm/normalize.
   */
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
+}
+
+// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
+template<>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+bool sqrt<bool>(const bool &x) { return x; }
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
+#endif
+
+/** \returns the reciprocal square root of \a x. **/
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T sqrt(const T &x)
+T rsqrt(const T& x)
 {
-  EIGEN_USING_STD_MATH(sqrt);
-  return sqrt(x);
+  return internal::rsqrt_impl<T>::run(x);
 }
 
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T log(const T &x) {
-  EIGEN_USING_STD_MATH(log);
-  return log(x);
+  return internal::log_impl<T>::run(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
+#endif
+
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log(const float &x) { return ::logf(x); }
 
@@ -1053,7 +1507,7 @@ template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
 abs(const T &x) {
-  EIGEN_USING_STD_MATH(abs);
+  EIGEN_USING_STD(abs);
   return abs(x);
 }
 
@@ -1064,12 +1518,12 @@ abs(const T &x) {
   return x;
 }
 
-#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   abs(float x) { return cl::sycl::fabs(x); }
-EIGEN_ALWAYS_INLINE double  abs(double x) { return cl::sycl::fabs(x); }
-#endif // defined(__SYCL_DEVICE_ONLY__)
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
+#endif
 
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const float &x) { return ::fabsf(x); }
 
@@ -1090,26 +1544,69 @@ double abs(const std::complex<double>& x) {
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T exp(const T &x) {
-  EIGEN_USING_STD_MATH(exp);
+  EIGEN_USING_STD(exp);
   return exp(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float exp(const float &x) { return ::expf(x); }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double exp(const double &x) { return ::exp(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<float> exp(const std::complex<float>& x) {
+  float com = ::expf(x.real());
+  float res_real = com * ::cosf(x.imag());
+  float res_imag = com * ::sinf(x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<double> exp(const std::complex<double>& x) {
+  double com = ::exp(x.real());
+  double res_real = com * ::cos(x.imag());
+  double res_imag = com * ::sin(x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
+#endif
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float expm1(const float &x) { return ::expm1f(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double expm1(const double &x) { return ::expm1(x); }
 #endif
 
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T cos(const T &x) {
-  EIGEN_USING_STD_MATH(cos);
+  EIGEN_USING_STD(cos);
   return cos(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cos(const float &x) { return ::cosf(x); }
 
@@ -1120,11 +1617,15 @@ double cos(const double &x) { return ::cos(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T sin(const T &x) {
-  EIGEN_USING_STD_MATH(sin);
+  EIGEN_USING_STD(sin);
   return sin(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sin(const float &x) { return ::sinf(x); }
 
@@ -1135,11 +1636,15 @@ double sin(const double &x) { return ::sin(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T tan(const T &x) {
-  EIGEN_USING_STD_MATH(tan);
+  EIGEN_USING_STD(tan);
   return tan(x);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tan(const float &x) { return ::tanf(x); }
 
@@ -1150,11 +1655,25 @@ double tan(const double &x) { return ::tan(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T acos(const T &x) {
-  EIGEN_USING_STD_MATH(acos);
+  EIGEN_USING_STD(acos);
   return acos(x);
 }
 
-#ifdef __CUDACC__
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T acosh(const T &x) {
+  EIGEN_USING_STD(acosh);
+  return static_cast<T>(acosh(x));
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float acos(const float &x) { return ::acosf(x); }
 
@@ -1165,11 +1684,25 @@ double acos(const double &x) { return ::acos(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T asin(const T &x) {
-  EIGEN_USING_STD_MATH(asin);
+  EIGEN_USING_STD(asin);
   return asin(x);
 }
 
-#ifdef __CUDACC__
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T asinh(const T &x) {
+  EIGEN_USING_STD(asinh);
+  return static_cast<T>(asinh(x));
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float asin(const float &x) { return ::asinf(x); }
 
@@ -1180,11 +1713,25 @@ double asin(const double &x) { return ::asin(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T atan(const T &x) {
-  EIGEN_USING_STD_MATH(atan);
-  return atan(x);
+  EIGEN_USING_STD(atan);
+  return static_cast<T>(atan(x));
+}
+
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atanh(const T &x) {
+  EIGEN_USING_STD(atanh);
+  return static_cast<T>(atanh(x));
 }
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
+#endif
 
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float atan(const float &x) { return ::atanf(x); }
 
@@ -1196,11 +1743,15 @@ double atan(const double &x) { return ::atan(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T cosh(const T &x) {
-  EIGEN_USING_STD_MATH(cosh);
-  return cosh(x);
+  EIGEN_USING_STD(cosh);
+  return static_cast<T>(cosh(x));
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cosh(const float &x) { return ::coshf(x); }
 
@@ -1211,11 +1762,15 @@ double cosh(const double &x) { return ::cosh(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T sinh(const T &x) {
-  EIGEN_USING_STD_MATH(sinh);
-  return sinh(x);
+  EIGEN_USING_STD(sinh);
+  return static_cast<T>(sinh(x));
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sinh(const float &x) { return ::sinhf(x); }
 
@@ -1226,16 +1781,20 @@ double sinh(const double &x) { return ::sinh(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T tanh(const T &x) {
-  EIGEN_USING_STD_MATH(tanh);
+  EIGEN_USING_STD(tanh);
   return tanh(x);
 }
 
-#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
+#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(float x) { return internal::generic_fast_tanh_float(x); }
 #endif
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }
 
@@ -1246,11 +1805,15 @@ double tanh(const double &x) { return ::tanh(x); }
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T fmod(const T& a, const T& b) {
-  EIGEN_USING_STD_MATH(fmod);
+  EIGEN_USING_STD(fmod);
   return fmod(a, b);
 }
 
-#ifdef __CUDACC__
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
+#endif
+
+#if defined(EIGEN_GPUCC)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float fmod(const float& a, const float& b) {
@@ -1264,6 +1827,23 @@ double fmod(const double& a, const double& b) {
 }
 #endif
 
+#if defined(SYCL_DEVICE_ONLY)
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
+#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
+#undef SYCL_SPECIALIZE_UNARY_FUNC
+#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
+#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
+#undef SYCL_SPECIALIZE_BINARY_FUNC
+#endif
+
 } // end namespace numext
 
 namespace internal {
@@ -1387,18 +1967,23 @@ template<> struct random_impl<bool>
   {
     return random<int>(0,1)==0 ? false : true;
   }
+
+  static inline bool run(const bool& a, const bool& b)
+  {
+    return random<int>(a, b)==0 ? false : true;
+  }
 };
 
 template<> struct scalar_fuzzy_impl<bool>
 {
   typedef bool RealScalar;
-  
+
   template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
   {
     return !x;
   }
-  
+
   EIGEN_DEVICE_FUNC
   static inline bool isApprox(bool x, bool y, bool)
   {
@@ -1410,10 +1995,61 @@ template<> struct scalar_fuzzy_impl<bool>
   {
     return (!x) || y;
   }
-  
+
+};
+
+} // end namespace internal
+
+// Default implementations that rely on other numext implementations
+namespace internal {
+
+// Specialization for complex types that are not supported by std::expm1.
+template <typename RealScalar>
+struct expm1_impl<std::complex<RealScalar> > {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    RealScalar xr = x.real();
+    RealScalar xi = x.imag();
+    // expm1(z) = exp(z) - 1
+    //          = exp(x +  i * y) - 1
+    //          = exp(x) * (cos(y) + i * sin(y)) - 1
+    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
+    // Imag(expm1(z)) = exp(x) * sin(y)
+    // Real(expm1(z)) = exp(x) * cos(y) - 1
+    //          = exp(x) * cos(y) - 1.
+    //          = expm1(x) + exp(x) * (cos(y) - 1)
+    //          = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2)
+    RealScalar erm1 = numext::expm1<RealScalar>(xr);
+    RealScalar er = erm1 + RealScalar(1.);
+    RealScalar sin2 = numext::sin(xi / RealScalar(2.));
+    sin2 = sin2 * sin2;
+    RealScalar s = numext::sin(xi);
+    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
+    return std::complex<RealScalar>(real_part, er * s);
+  }
 };
 
-  
+template<typename T>
+struct rsqrt_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE T run(const T& x) {
+    return T(1)/numext::sqrt(x);
+  }
+};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct conj_impl<std::complex<T>, true>
+{
+  EIGEN_DEVICE_FUNC
+  static inline std::complex<T> run(const std::complex<T>& x)
+  {
+    return std::complex<T>(numext::real(x), -numext::imag(x));
+  }
+};
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/MathFunctionsImpl.h b/contrib/eigen/Eigen/src/Core/MathFunctionsImpl.h
index 9c1ceb0eb0f4a0bb2a03adef3b7fd72bc5b20b88..4eaaaa78449031830cea7446210226afcd2467d0 100644
--- a/contrib/eigen/Eigen/src/Core/MathFunctionsImpl.h
+++ b/contrib/eigen/Eigen/src/Core/MathFunctionsImpl.h
@@ -17,24 +17,28 @@ namespace internal {
 
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
     Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-    is accurate up to a couple of ulp in the range [-9, 9], outside of which
-    the tanh(x) = +/-1.
+    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004]
+    the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero.
 
     This implementation works on both scalars and packets.
 */
 template<typename T>
 T generic_fast_tanh_float(const T& a_x)
 {
-  // Clamp the inputs to the range [-9, 9] since anything outside
-  // this range is +/-1.0f in single-precision.
-  const T plus_9 = pset1<T>(9.f);
-  const T minus_9 = pset1<T>(-9.f);
-  // NOTE GCC prior to 6.3 might improperly optimize this max/min
-  //      step such that if a_x is nan, x will be either 9 or -9,
-  //      and tanh will return 1 or -1 instead of nan.
-  //      This is supposed to be fixed in gcc6.3,
-  //      see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
-  const T x = pmax(minus_9,pmin(plus_9,a_x));
+  // Clamp the inputs to the range [-c, c]
+#ifdef EIGEN_VECTORIZE_FMA
+  const T plus_clamp = pset1<T>(7.99881172180175781f);
+  const T minus_clamp = pset1<T>(-7.99881172180175781f);
+#else
+  const T plus_clamp = pset1<T>(7.90531110763549805f);
+  const T minus_clamp = pset1<T>(-7.90531110763549805f);
+#endif
+  const T tiny = pset1<T>(0.0004f);
+  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+  const T tiny_mask = pcmp_lt(pabs(a_x), tiny);
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
@@ -62,24 +66,30 @@ T generic_fast_tanh_float(const T& a_x)
   p = pmadd(x2, p, alpha_1);
   p = pmul(x, p);
 
-  // Evaluate the denominator polynomial p.
+  // Evaluate the denominator polynomial q.
   T q = pmadd(x2, beta_6, beta_4);
   q = pmadd(x2, q, beta_2);
   q = pmadd(x2, q, beta_0);
 
   // Divide the numerator by the denominator.
-  return pdiv(p, q);
+  return pselect(tiny_mask, x, pdiv(p, q));
 }
 
 template<typename RealScalar>
-EIGEN_STRONG_INLINE
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
-  EIGEN_USING_STD_MATH(sqrt);
+  // IEEE IEC 6059 special cases.
+  if ((numext::isinf)(x) || (numext::isinf)(y))
+    return NumTraits<RealScalar>::infinity();
+  if ((numext::isnan)(x) || (numext::isnan)(y))
+    return NumTraits<RealScalar>::quiet_NaN();
+    
+  EIGEN_USING_STD(sqrt);
   RealScalar p, qp;
   p = numext::maxi(x,y);
   if(p==RealScalar(0)) return RealScalar(0);
-  qp = numext::mini(y,x) / p;    
+  qp = numext::mini(y,x) / p;
   return p * sqrt(RealScalar(1) + qp*qp);
 }
 
@@ -87,13 +97,102 @@ template<typename Scalar>
 struct hypot_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x, const Scalar& y)
+  static EIGEN_DEVICE_FUNC
+  inline RealScalar run(const Scalar& x, const Scalar& y)
   {
-    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD(abs);
     return positive_real_hypot<RealScalar>(abs(x), abs(y));
   }
 };
 
+// Generic complex sqrt implementation that correctly handles corner cases
+// according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_sqrt(const std::complex<T>& z) {
+  // Computes the principal sqrt of the input.
+  //
+  // For a complex square root of the number x + i*y. We want to find real
+  // numbers u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*v.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = y / (2 * u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = y / (2 * v)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w, v = sign(y) * w
+  //   if x > 0:  u = w, v = y / (2 * w)
+  //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
+
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y)));
+
+  return
+    (numext::isinf)(y) ? std::complex<T>(NumTraits<T>::infinity(), y)
+      : x == zero ? std::complex<T>(w, y < zero ? -w : w)
+      : x > zero ? std::complex<T>(w, y / (2 * w))
+      : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w );
+}
+
+// Generic complex rsqrt implementation.
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
+  // Computes the principal reciprocal sqrt of the input.
+  //
+  // For a complex reciprocal square root of the number z = x + i*y. We want to
+  // find real numbers u and v such that
+  //    (u + i*v)^2 = 1 / (x + i*y)  <=>
+  //    u^2 - v^2 + i*2*u*v = x/|z|^2 - i*v/|z|^2.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x/|z|^2
+  //    2*u*v = y/|z|^2.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + |z|)) / |z|
+  //    v = -y / (2 * u * |z|)
+  // and for x < 0,
+  //    v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z|
+  //    u = -y / (2 * v * |z|)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w / |z|, v = -sign(y) * w / |z|
+  //   if x > 0:  u = w / |z|, v = -y / (2 * w * |z|)
+  //   if x < 0:  u = |y| / (2 * w * |z|), v = -sign(y) * w / |z|
+
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+
+  const T abs_z = numext::hypot(x, y);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z));
+  const T woz = w / abs_z;
+  // Corner cases consistent with 1/sqrt(z) on gcc/clang.
+  return
+    abs_z == zero ? std::complex<T>(NumTraits<T>::infinity(), NumTraits<T>::quiet_NaN())
+      : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex<T>(zero, zero)
+      : x == zero ? std::complex<T>(woz, y < zero ? woz : -woz)
+      : x > zero ? std::complex<T>(woz, -y / (2 * w * abs_z))
+      : std::complex<T>(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz );
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z) {
+  // Computes complex log.
+  T a = numext::abs(z);
+  EIGEN_USING_STD(atan2);
+  T b = atan2(z.imag(), z.real());
+  return std::complex<T>(numext::log(a), b);
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/Matrix.h b/contrib/eigen/Eigen/src/Core/Matrix.h
index 7f4a7af93c6bb14f9e81f24886759ddba79a6ee5..f0e59a911d27602d3194daa1e3fe23c929f9670e 100644
--- a/contrib/eigen/Eigen/src/Core/Matrix.h
+++ b/contrib/eigen/Eigen/src/Core/Matrix.h
@@ -29,7 +29,7 @@ private:
       required_alignment = unpacket_traits<PacketScalar>::alignment,
       packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0
     };
-    
+
 public:
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
@@ -44,7 +44,7 @@ public:
     Options = _Options,
     InnerStrideAtCompileTime = 1,
     OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
-    
+
     // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
     EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
     Alignment = actual_alignment
@@ -255,53 +255,93 @@ class Matrix
       *
       * \sa resize(Index,Index)
       */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix() : Base()
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix() : Base()
     {
       Base::_check_template_params();
       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
     // FIXME is it still needed
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     explicit Matrix(internal::constructor_without_unaligned_array_assert)
       : Base(internal::constructor_without_unaligned_array_assert())
     { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
 
 #if EIGEN_HAS_RVALUE_REFERENCES
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
       : Base(std::move(other))
     {
       Base::_check_template_params();
     }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
     {
-      other.swap(*this);
+      Base::operator=(std::move(other));
       return *this;
     }
 #endif
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
+#if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&... args)
+     *
+     * Example: \include Matrix_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
+     *
+     * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      *
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      *
+      * Example: \include Matrix_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
+      *
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      *
+      * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
+      * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
+      *
+      * Example: \include Matrix_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
+      *
+      * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
+      * and implicit transposition is allowed for compile-time vectors only.
+      *
+      * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+#endif // end EIGEN_HAS_CXX11
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 
     // This constructor is for both 1x1 matrices and dynamic vectors
     template<typename T>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE explicit Matrix(const T& x)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    explicit Matrix(const T& x)
     {
       Base::_check_template_params();
       Base::template _init1<T>(x);
     }
 
     template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const T0& x, const T1& y)
     {
       Base::_check_template_params();
       Base::template _init2<T0,T1>(x, y);
     }
-    #else
+
+
+#else
     /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
     EIGEN_DEVICE_FUNC
     explicit Matrix(const Scalar *data);
@@ -311,7 +351,7 @@ class Matrix
       * This is useful for dynamic-size vectors. For fixed-size vectors,
       * it is redundant to pass these parameters, so one should use the default constructor
       * Matrix() instead.
-      * 
+      *
       * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
       * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
       * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
@@ -319,14 +359,15 @@ class Matrix
       * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
       */
     EIGEN_STRONG_INLINE explicit Matrix(Index dim);
-    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
     Matrix(const Scalar& x);
     /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
       *
       * This is useful for dynamic-size matrices. For fixed-size matrices,
       * it is redundant to pass these parameters, so one should use the default constructor
       * Matrix() instead.
-      * 
+      *
       * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
       * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
       * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
@@ -335,12 +376,15 @@ class Matrix
       */
     EIGEN_DEVICE_FUNC
     Matrix(Index rows, Index cols);
-    
-    /** \brief Constructs an initialized 2D vector with given coefficients */
+
+    /** \brief Constructs an initialized 2D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
     Matrix(const Scalar& x, const Scalar& y);
-    #endif
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN
 
-    /** \brief Constructs an initialized 3D vector with given coefficients */
+    /** \brief Constructs an initialized 3D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
     {
@@ -350,7 +394,9 @@ class Matrix
       m_storage.data()[1] = y;
       m_storage.data()[2] = z;
     }
-    /** \brief Constructs an initialized 4D vector with given coefficients */
+    /** \brief Constructs an initialized 4D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
     {
@@ -377,8 +423,10 @@ class Matrix
       : Base(other.derived())
     { }
 
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return 1; }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); }
 
     /////////// Geometry module ///////////
 
@@ -405,7 +453,7 @@ class Matrix
   *
   * \ingroup Core_Module
   *
-  * Eigen defines several typedef shortcuts for most common matrix and vector types.
+  * %Eigen defines several typedef shortcuts for most common matrix and vector types.
   *
   * The general patterns are the following:
   *
@@ -418,6 +466,15 @@ class Matrix
   * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
   * a fixed-size vector of 4 complex floats.
   *
+  * With \cpp11, template alias are also defined for common sizes.
+  * They follow the same pattern as above except that the scalar type suffix is replaced by a
+  * template parameter, i.e.:
+  *   - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
+  *   - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
+  *   - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
+  *
+  * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and `RowVector<Type,Size>`.
+  *
   * \sa class Matrix
   */
 
@@ -454,6 +511,55 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 #undef EIGEN_MAKE_TYPEDEFS
 #undef EIGEN_MAKE_FIXED_TYPEDEFS
 
+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix)                     \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##SizeSuffix = Matrix<Type, Size, Size>;              \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Vector##SizeSuffix = Matrix<Type, Size, 1>;                 \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Size)                           \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##Size##X = Matrix<Type, Size, Dynamic>;              \
+/** \ingroup matrixtypedefs */                                    \
+/** \brief \cpp11 */                                              \
+template <typename Type>                                          \
+using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
+
+EIGEN_MAKE_TYPEDEFS(2, 2)
+EIGEN_MAKE_TYPEDEFS(3, 3)
+EIGEN_MAKE_TYPEDEFS(4, 4)
+EIGEN_MAKE_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_FIXED_TYPEDEFS(4)
+
+/** \ingroup matrixtypedefs
+  * \brief \cpp11 */
+template <typename Type, int Size>
+using Vector = Matrix<Type, Size, 1>;
+
+/** \ingroup matrixtypedefs
+  * \brief \cpp11 */
+template <typename Type, int Size>
+using RowVector = Matrix<Type, 1, Size>;
+
+#undef EIGEN_MAKE_TYPEDEFS
+#undef EIGEN_MAKE_FIXED_TYPEDEFS
+
+#endif // EIGEN_HAS_CXX11
+
 } // end namespace Eigen
 
 #endif // EIGEN_MATRIX_H
diff --git a/contrib/eigen/Eigen/src/Core/MatrixBase.h b/contrib/eigen/Eigen/src/Core/MatrixBase.h
index f8bcc8c6f56d68c2bc50e3beb16aafe3fd04446b..45c3a596ecb09d5e55f4eaf3c742d0badb624373 100644
--- a/contrib/eigen/Eigen/src/Core/MatrixBase.h
+++ b/contrib/eigen/Eigen/src/Core/MatrixBase.h
@@ -76,6 +76,7 @@ template<typename Derived> class MatrixBase
     using Base::coeffRef;
     using Base::lazyAssign;
     using Base::eval;
+    using Base::operator-;
     using Base::operator+=;
     using Base::operator-=;
     using Base::operator*=;
@@ -122,7 +123,6 @@ template<typename Derived> class MatrixBase
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
 #define EIGEN_DOC_UNARY_ADDONS(X,Y)
-#   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseBinaryOps.h"
@@ -268,6 +268,8 @@ template<typename Derived> class MatrixBase
     Derived& setIdentity();
     EIGEN_DEVICE_FUNC
     Derived& setIdentity(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
+    EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
 
     bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -296,7 +298,7 @@ template<typename Derived> class MatrixBase
     EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
     { return cwiseNotEqual(other).any(); }
 
-    NoAlias<Derived,Eigen::MatrixBase > noalias();
+    NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
 
     // TODO forceAlignedAccess is temporarily disabled
     // Need to find a nicer workaround.
@@ -326,6 +328,7 @@ template<typename Derived> class MatrixBase
 
     inline const PartialPivLU<PlainObject> lu() const;
 
+    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
@@ -335,12 +338,15 @@ template<typename Derived> class MatrixBase
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
     template<typename ResultType>
     inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
+
+    EIGEN_DEVICE_FUNC
     Scalar determinant() const;
 
 /////////// Cholesky module ///////////
@@ -412,15 +418,19 @@ template<typename Derived> class MatrixBase
 
 ////////// Householder module ///////////
 
+    EIGEN_DEVICE_FUNC
     void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void makeHouseholder(EssentialPart& essential,
                          Scalar& tau, RealScalar& beta) const;
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
     template<typename EssentialPart>
+    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheRight(const EssentialPart& essential,
                                     const Scalar& tau,
                                     Scalar* workspace);
@@ -428,8 +438,10 @@ template<typename Derived> class MatrixBase
 ///////// Jacobi module /////////
 
     template<typename OtherScalar>
+    EIGEN_DEVICE_FUNC
     void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
     template<typename OtherScalar>
+    EIGEN_DEVICE_FUNC
     void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
 
 ///////// SparseCore module /////////
@@ -456,6 +468,11 @@ template<typename Derived> class MatrixBase
     const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+#if EIGEN_HAS_CXX11_MATH
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
+#endif
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
     EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
diff --git a/contrib/eigen/Eigen/src/Core/NestByValue.h b/contrib/eigen/Eigen/src/Core/NestByValue.h
index 13adf070e8987cb452d12256d7c1ff113b8f2721..b4275768a01df3b8e2a06424010bd8d33cffea30 100644
--- a/contrib/eigen/Eigen/src/Core/NestByValue.h
+++ b/contrib/eigen/Eigen/src/Core/NestByValue.h
@@ -16,7 +16,11 @@ namespace Eigen {
 namespace internal {
 template<typename ExpressionType>
 struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
-{};
+{
+  enum {
+    Flags = traits<ExpressionType>::Flags & ~NestByRefBit
+  };
+};
 }
 
 /** \class NestByValue
@@ -41,57 +45,13 @@ template<typename ExpressionType> class NestByValue
 
     EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
-
-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_expression.coeff(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_expression.const_cast_derived().coeffRef(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_expression.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); }
 
     EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
+    EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
+
   protected:
     const ExpressionType m_expression;
 };
@@ -99,12 +59,27 @@ template<typename ExpressionType> class NestByValue
 /** \returns an expression of the temporary version of *this.
   */
 template<typename Derived>
-inline const NestByValue<Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
 DenseBase<Derived>::nestByValue() const
 {
   return NestByValue<Derived>(derived());
 }
 
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template<typename ArgType>
+struct evaluator<NestByValue<ArgType> >
+  : public evaluator<ArgType>
+{
+  typedef evaluator<ArgType> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
+    : Base(xpr.nestedExpression())
+  {}
+};
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_NESTBYVALUE_H
diff --git a/contrib/eigen/Eigen/src/Core/NoAlias.h b/contrib/eigen/Eigen/src/Core/NoAlias.h
index 33908010b4f13631146eb45a6bfdf04cae2e58c5..570283d90f1c5fba2b3cae629421b624128283c2 100644
--- a/contrib/eigen/Eigen/src/Core/NoAlias.h
+++ b/contrib/eigen/Eigen/src/Core/NoAlias.h
@@ -33,6 +33,7 @@ class NoAlias
   public:
     typedef typename ExpressionType::Scalar Scalar;
     
+    EIGEN_DEVICE_FUNC
     explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
     
     template<typename OtherDerived>
@@ -74,10 +75,10 @@ class NoAlias
   *
   * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
   * Currently, even though several expressions may alias, only product
-  * expressions have this flag. Therefore, noalias() is only usefull when
+  * expressions have this flag. Therefore, noalias() is only useful when
   * the source expression contains a matrix product.
   *
-  * Here are some examples where noalias is usefull:
+  * Here are some examples where noalias is useful:
   * \code
   * D.noalias()  = A * B;
   * D.noalias() += A.transpose() * B;
@@ -98,7 +99,7 @@ class NoAlias
   * \sa class NoAlias
   */
 template<typename Derived>
-NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
+NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
 {
   return NoAlias<Derived, Eigen::MatrixBase >(derived());
 }
diff --git a/contrib/eigen/Eigen/src/Core/NumTraits.h b/contrib/eigen/Eigen/src/Core/NumTraits.h
index daf48987898f7a38e27b996515e9da44a8d4a7ef..72eac5a93c09ab5f8bc4ccc50fc8a22ae3f2e0c1 100644
--- a/contrib/eigen/Eigen/src/Core/NumTraits.h
+++ b/contrib/eigen/Eigen/src/Core/NumTraits.h
@@ -21,12 +21,14 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int run() { return std::numeric_limits<T>::digits10; }
 };
 
 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int run() {
     using std::log10;
     using std::ceil;
@@ -38,11 +40,64 @@ struct default_digits10_impl<T,false,false> // Floating point
 template<typename T>
 struct default_digits10_impl<T,false,true> // Integer
 {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return 0; }
+};
+
+
+// default implementation of digits(), based on numeric_limits if specialized,
+// 0 for integer types, and log2(epsilon()) otherwise.
+template< typename T,
+          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() { return std::numeric_limits<T>::digits; }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,false> // Floating point
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static int run() {
+    using std::log;
+    using std::ceil;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
+  }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,true> // Integer
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int run() { return 0; }
 };
 
 } // end namespace internal
 
+namespace numext {
+/** \internal bit-wise cast without changing the underlying bit representation. */
+
+// TODO: Replace by std::bit_cast (available in C++20)
+template <typename Tgt, typename Src>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
+#if EIGEN_HAS_TYPE_TRAITS
+  // The behaviour of memcpy is not specified for non-trivially copyable types
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
+                      THIS_TYPE_IS_NOT_SUPPORTED);
+#endif
+
+  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
+  Tgt tgt;
+  EIGEN_USING_STD(memcpy)
+  memcpy(&tgt, &src, sizeof(Tgt));
+  return tgt;
+}
+}  // namespace numext
+
 /** \class NumTraits
   * \ingroup Core_Module
   *
@@ -71,7 +126,7 @@ struct default_digits10_impl<T,false,true> // Integer
   *     and to \c 0 otherwise.
   * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
   *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
-  *     Stay vague here. No need to do architecture-specific stuff.
+  *     Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
   * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
   * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
   *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
@@ -80,9 +135,18 @@ struct default_digits10_impl<T,false,true> // Integer
   * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
   *     value by the fuzzy comparison operators.
   * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is
+  *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">std::numeric_limits<T>::digits</a>
+  *     which is used as the default implementation if specialized.
   * \li digits10() function returning the number of decimal digits that can be represented without change. This is
   *     the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
   *     which is used as the default implementation if specialized.
+  * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively,
+  *     such that the radix raised to the power exponent-1 is a normalized floating-point number.  These are equivalent to
+  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">std::numeric_limits<T>::min_exponent</a>/
+  *     <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">std::numeric_limits<T>::max_exponent</a>.
+  * \li infinity() function returning a representation of positive infinity, if available.
+  * \li quiet_NaN function returning a non-signaling "not-a-number", if available.
   */
 
 template<typename T> struct GenericNumTraits
@@ -106,42 +170,60 @@ template<typename T> struct GenericNumTraits
   typedef T Nested;
   typedef T Literal;
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real epsilon()
   {
     return numext::numeric_limits<T>::epsilon();
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline int digits10()
   {
     return internal::default_digits10_impl<T>::run();
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int digits()
+  {
+    return internal::default_digits_impl<T>::run();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int min_exponent()
+  {
+    return numext::numeric_limits<T>::min_exponent;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline int max_exponent()
+  {
+    return numext::numeric_limits<T>::max_exponent;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real dummy_precision()
   {
     // make sure to override this for floating-point types
     return Real(0);
   }
 
-
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T highest() {
     return (numext::numeric_limits<T>::max)();
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T lowest()  {
-    return IsInteger ? (numext::numeric_limits<T>::min)() : (-(numext::numeric_limits<T>::max)());
+    return IsInteger ? (numext::numeric_limits<T>::min)()
+                     : static_cast<T>(-(numext::numeric_limits<T>::max)());
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T infinity() {
     return numext::numeric_limits<T>::infinity();
   }
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline T quiet_NaN() {
     return numext::numeric_limits<T>::quiet_NaN();
   }
@@ -153,19 +235,20 @@ template<typename T> struct NumTraits : GenericNumTraits<T>
 template<> struct NumTraits<float>
   : GenericNumTraits<float>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline float dummy_precision() { return 1e-5f; }
 };
 
 template<> struct NumTraits<double> : GenericNumTraits<double>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline double dummy_precision() { return 1e-12; }
 };
 
 template<> struct NumTraits<long double>
   : GenericNumTraits<long double>
 {
+  EIGEN_CONSTEXPR
   static inline long double dummy_precision() { return 1e-15l; }
 };
 
@@ -182,11 +265,11 @@ template<typename _Real> struct NumTraits<std::complex<_Real> >
     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
   };
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline int digits10() { return NumTraits<Real>::digits10(); }
 };
 
@@ -206,16 +289,17 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
     IsInteger = NumTraits<Scalar>::IsInteger,
     IsSigned  = NumTraits<Scalar>::IsSigned,
     RequireInitialization = 1,
-    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
-    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
-    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::ReadCost),
+    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::AddCost),
+    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
   };
 
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
 
+  EIGEN_CONSTEXPR
   static inline int digits10() { return NumTraits<Scalar>::digits10(); }
 };
 
@@ -229,6 +313,7 @@ template<> struct NumTraits<std::string>
     MulCost  = HugeCost
   };
 
+  EIGEN_CONSTEXPR
   static inline int digits10() { return 0; }
 
 private:
@@ -243,6 +328,8 @@ private:
 // Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
 template<> struct NumTraits<void> {};
 
+template<> struct NumTraits<bool> : GenericNumTraits<bool> {};
+
 } // end namespace Eigen
 
 #endif // EIGEN_NUMTRAITS_H
diff --git a/contrib/eigen/Eigen/src/Core/PermutationMatrix.h b/contrib/eigen/Eigen/src/Core/PermutationMatrix.h
index b1fb455b98c236588e5069a7fa18c3d64eeb71c5..69401bf41e5296d421fb2e024aec558e8a684e3f 100644
--- a/contrib/eigen/Eigen/src/Core/PermutationMatrix.h
+++ b/contrib/eigen/Eigen/src/Core/PermutationMatrix.h
@@ -87,25 +87,14 @@ class PermutationBase : public EigenBase<Derived>
       return derived();
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const PermutationBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
-
     /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
 
     /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
 
     /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename DenseDerived>
@@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
     inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
-    #endif
-
     /** Generic constructor from expression of the indices. The indices
       * array has the meaning that the permutations sends each integer i to indices[i].
       *
@@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
       return Base::operator=(tr.derived());
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    PermutationMatrix& operator=(const PermutationMatrix& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** const version of indices(). */
     const IndicesType& indices() const { return m_indices; }
     /** \returns a reference to the stored array representing the permutation. */
diff --git a/contrib/eigen/Eigen/src/Core/PlainObjectBase.h b/contrib/eigen/Eigen/src/Core/PlainObjectBase.h
index 0f3632cfd14ec0545fe5c44f7f27a572144de15c..e2ddbd1d522a283a5992a504f023eebc48a05670 100644
--- a/contrib/eigen/Eigen/src/Core/PlainObjectBase.h
+++ b/contrib/eigen/Eigen/src/Core/PlainObjectBase.h
@@ -13,10 +13,10 @@
 
 #if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO)
 # define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
 #elif defined(EIGEN_INITIALIZE_MATRICES_BY_NAN)
 # define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
+# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
 #else
 # undef EIGEN_INITIALIZE_COEFFS
 # define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@@ -104,7 +104,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
     typedef typename internal::traits<Derived>::Scalar Scalar;
-    
+
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Derived DenseType;
@@ -118,16 +118,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     using Base::IsVectorAtCompileTime;
     using Base::Flags;
 
-    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
-    friend  class Eigen::Map<Derived, Unaligned>;
     typedef Eigen::Map<Derived, Unaligned>  MapType;
-    friend  class Eigen::Map<const Derived, Unaligned>;
     typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
-#if EIGEN_MAX_ALIGN_BYTES>0
-    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
-    friend  class Eigen::Map<Derived, AlignedMax>;
-    friend  class Eigen::Map<const Derived, AlignedMax>;
-#endif
     typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
     typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
     template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
@@ -147,10 +139,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     EIGEN_DEVICE_FUNC
     const Base& base() const { return *static_cast<const Base*>(this); }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); }
 
     /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
       * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
@@ -358,7 +350,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
     {
       const OtherDerived& other = _other.derived();
@@ -383,7 +375,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
       * conservativeResize(Index, NoChange_t).
       *
-      * Matrices are resized relative to the top-left element. In case values need to be 
+      * Matrices are resized relative to the top-left element. In case values need to be
       * appended to the matrix they will be uninitialized.
       */
     EIGEN_DEVICE_FUNC
@@ -440,7 +432,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
       * conservativeResize(Index, NoChange_t).
       *
-      * Matrices are resized relative to the top-left element. In case values need to be 
+      * Matrices are resized relative to the top-left element. In case values need to be
       * appended to the matrix they will copied from \c other.
       */
     template<typename OtherDerived>
@@ -508,8 +500,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     EIGEN_DEVICE_FUNC
     PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT
     {
-      using std::swap;
-      swap(m_storage, other.m_storage);
+      _check_template_params();
+      m_storage = std::move(other.m_storage);
       return *this;
     }
 #endif
@@ -526,6 +518,71 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11
+      *
+      * \only_for_vectors
+      *
+      * This constructor is for 1D array or vectors with more than 4 coefficients.
+      * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
+      *
+      * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+      * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+      */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : m_storage()
+    {
+      _check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
+      m_storage.data()[0] = a0;
+      m_storage.data()[1] = a1;
+      m_storage.data()[2] = a2;
+      m_storage.data()[3] = a3;
+      Index i = 4;
+      auto x = {(m_storage.data()[i++] = args, 0)...};
+      static_cast<void>(x);
+    }
+
+    /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_storage()
+    {
+      _check_template_params();
+
+      size_t list_size = 0;
+      if (list.begin() != list.end()) {
+        list_size = list.begin()->size();
+      }
+
+      // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
+      if (ColsAtCompileTime == 1 && list.size() == 1) {
+        eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        resize(list_size, ColsAtCompileTime);
+        std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+      } else {
+        eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
+        resize(list.size(), list_size);
+
+        Index row_index = 0;
+        for (const std::initializer_list<Scalar>& row : list) {
+          eigen_assert(list_size == row.size());
+          Index col_index = 0;
+          for (const Scalar& e : row) {
+            coeffRef(row_index, col_index) = e;
+            ++col_index;
+          }
+          ++row_index;
+        }
+      }
+    }
+    #endif  // end EIGEN_HAS_CXX11
+
     /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
@@ -564,7 +621,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
     {
       _resize_to_match(other);
@@ -652,18 +709,26 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     using Base::setConstant;
     EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
     EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val);
 
     using Base::setZero;
     EIGEN_DEVICE_FUNC Derived& setZero(Index size);
     EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t);
 
     using Base::setOnes;
     EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
     EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t);
 
     using Base::setRandom;
     Derived& setRandom(Index size);
     Derived& setRandom(Index rows, Index cols);
+    Derived& setRandom(NoChange_t, Index cols);
+    Derived& setRandom(Index rows, NoChange_t);
 
     #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN
     #include EIGEN_PLAINOBJECTBASE_PLUGIN
@@ -678,7 +743,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
     {
       #ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -705,10 +770,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \internal
       */
-    // aliasing is dealt once in internall::call_assignment
+    // aliasing is dealt once in internal::call_assignment
     // so at this stage we have to assume aliasing... and resising has to be done later.
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
     {
       internal::call_assignment(this->derived(), other.derived());
@@ -721,7 +786,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * \sa operator=(const MatrixBase<OtherDerived>&), _set()
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
     {
       // I don't think we need this resize call since the lazyAssign will anyways resize
@@ -744,18 +809,18 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                           FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
       resize(rows,cols);
     }
-    
+
     template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
     {
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
       m_storage.data()[0] = Scalar(val0);
       m_storage.data()[1] = Scalar(val1);
     }
-    
+
     template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
                                     typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
                                                                   && (internal::is_same<T0,Index>::value)
@@ -781,8 +846,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                           FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
       resize(size);
     }
-    
-    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
+
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
     template<typename T>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@@ -790,7 +855,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
       m_storage.data()[0] = val0;
     }
-    
+
     // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type)
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -846,7 +911,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     {
       this->derived() = r;
     }
-    
+
     // For fixed-size Array<Scalar,...>
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -858,7 +923,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     {
       Base::setConstant(val0);
     }
-    
+
     // For fixed-size Array<Index,...>
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -872,38 +937,38 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     {
       Base::setConstant(val0);
     }
-    
+
     template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
     friend struct internal::matrix_swap_impl;
 
   public:
-    
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal
       * \brief Override DenseBase::swap() since for dynamic-sized matrices
       * of same type it is enough to swap the data pointers.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void swap(DenseBase<OtherDerived> & other)
     {
       enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
       internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
     }
-    
+
     /** \internal
       * \brief const version forwarded to DenseBase::swap
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void swap(DenseBase<OtherDerived> const & other)
     { Base::swap(other.derived()); }
-    
-    EIGEN_DEVICE_FUNC 
+
+    EIGEN_DEVICE_FUNC
     static EIGEN_STRONG_INLINE void _check_template_params()
     {
-      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
-                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0)
+      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor)
+                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0)
                         && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0))
                         && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0))
                         && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0))
@@ -915,6 +980,17 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     }
 
     enum { IsPlainObjectBase = 1 };
+#endif
+  public:
+    // These apparently need to be down here for nvcc+icc to prevent duplicate
+    // Map symbol.
+    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
+    friend class Eigen::Map<Derived, Unaligned>;
+    friend class Eigen::Map<const Derived, Unaligned>;
+#if EIGEN_MAX_ALIGN_BYTES>0
+    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
+    friend class Eigen::Map<Derived, AlignedMax>;
+    friend class Eigen::Map<const Derived, AlignedMax>;
 #endif
 };
 
@@ -923,13 +999,19 @@ namespace internal {
 template <typename Derived, typename OtherDerived, bool IsVector>
 struct conservative_resize_like_impl
 {
+  #if EIGEN_HAS_TYPE_TRAITS
+  static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
+  #else
+  static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
+  #endif
   static void run(DenseBase<Derived>& _this, Index rows, Index cols)
   {
     if (_this.rows() == rows && _this.cols() == cols) return;
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
 
-    if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
-         (!Derived::IsRowMajor && _this.rows() == rows) )  // column-major and we change only the number of columns
+    if ( IsRelocatable
+          && (( Derived::IsRowMajor && _this.cols() == cols) ||  // row-major and we change only the number of rows
+              (!Derived::IsRowMajor && _this.rows() == rows) ))  // column-major and we change only the number of columns
     {
       internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
       _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@@ -937,7 +1019,7 @@ struct conservative_resize_like_impl
     else
     {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(rows,cols);
+      Derived tmp(rows,cols);
       const Index common_rows = numext::mini(rows, _this.rows());
       const Index common_cols = numext::mini(cols, _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
@@ -957,8 +1039,9 @@ struct conservative_resize_like_impl
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
 
-    if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
-         (!Derived::IsRowMajor && _this.rows() == other.rows()) )  // column-major and we change only the number of columns
+    if ( IsRelocatable &&
+          (( Derived::IsRowMajor && _this.cols() == other.cols()) ||  // row-major and we change only the number of rows
+           (!Derived::IsRowMajor && _this.rows() == other.rows()) ))  // column-major and we change only the number of columns
     {
       const Index new_rows = other.rows() - _this.rows();
       const Index new_cols = other.cols() - _this.cols();
@@ -971,7 +1054,7 @@ struct conservative_resize_like_impl
     else
     {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(other);
+      Derived tmp(other);
       const Index common_rows = numext::mini(tmp.rows(), _this.rows());
       const Index common_cols = numext::mini(tmp.cols(), _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
@@ -986,13 +1069,18 @@ template <typename Derived, typename OtherDerived>
 struct conservative_resize_like_impl<Derived,OtherDerived,true>
   : conservative_resize_like_impl<Derived,OtherDerived,false>
 {
-  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
-  
+  typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
+  using Base::run;
+  using Base::IsRelocatable;
+
   static void run(DenseBase<Derived>& _this, Index size)
   {
     const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
     const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
-    _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+    if(IsRelocatable)
+      _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
   }
 
   static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@@ -1003,7 +1091,10 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
 
     const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
     const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
-    _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    if(IsRelocatable)
+      _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
 
     if (num_new_elements > 0)
       _this.tail(num_new_elements) = other.tail(num_new_elements);
@@ -1014,7 +1105,7 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
 struct matrix_swap_impl
 {
   EIGEN_DEVICE_FUNC
-  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
+  static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b)
   {
     a.base().swap(b);
   }
diff --git a/contrib/eigen/Eigen/src/Core/Product.h b/contrib/eigen/Eigen/src/Core/Product.h
index 676c48027792c944027c01ed4abff90cbffb09d6..70a6c10639695fd5c50c833707ed6aacfe1f1b1e 100644
--- a/contrib/eigen/Eigen/src/Core/Product.h
+++ b/contrib/eigen/Eigen/src/Core/Product.h
@@ -23,25 +23,25 @@ struct traits<Product<Lhs, Rhs, Option> >
   typedef typename remove_all<Rhs>::type RhsCleaned;
   typedef traits<LhsCleaned> LhsTraits;
   typedef traits<RhsCleaned> RhsTraits;
-  
+
   typedef MatrixXpr XprKind;
-  
+
   typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
   typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
                                                 typename RhsTraits::StorageKind,
                                                 internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
   typedef typename promote_index_type<typename LhsTraits::StorageIndex,
                                       typename RhsTraits::StorageIndex>::type StorageIndex;
-  
+
   enum {
     RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
     ColsAtCompileTime    = RhsTraits::ColsAtCompileTime,
     MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
-    
+
     // FIXME: only needed by GeneralMatrixMatrixTriangular
     InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
-    
+
     // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
     Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit
           : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
@@ -74,10 +74,10 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
                                                                                    internal::product_type<_Lhs,_Rhs>::ret>::ret>
 {
   public:
-    
+
     typedef _Lhs Lhs;
     typedef _Rhs Rhs;
-    
+
     typedef typename ProductImpl<
         Lhs, Rhs, Option,
         typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
@@ -90,18 +90,23 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
 
-    EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
     {
       eigen_assert(lhs.cols() == rhs.rows()
         && "invalid matrix product"
         && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
-    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
-    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const RhsNestedCleaned& rhs() const { return m_rhs; }
 
   protected:
 
@@ -110,13 +115,13 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
 };
 
 namespace internal {
-  
+
 template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
 class dense_product_base
  : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
 {};
 
-/** Convertion to scalar for inner-products */
+/** Conversion to scalar for inner-products */
 template<typename Lhs, typename Rhs, int Option>
 class dense_product_base<Lhs, Rhs, Option, InnerProduct>
  : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@@ -126,8 +131,8 @@ class dense_product_base<Lhs, Rhs, Option, InnerProduct>
 public:
   using Base::derived;
   typedef typename Base::Scalar Scalar;
-  
-  EIGEN_STRONG_INLINE operator const Scalar() const
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
   {
     return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
   }
@@ -148,25 +153,25 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
   : public internal::dense_product_base<Lhs,Rhs,Option>
 {
     typedef Product<Lhs, Rhs, Option> Derived;
-    
+
   public:
-    
+
     typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
   protected:
     enum {
-      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && 
+      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) &&
                    (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
       EnableCoeff = IsOneByOne || Option==LazyProduct
     };
-    
+
   public:
-  
+
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
     {
       EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
       eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
-      
+
       return internal::evaluator<Derived>(derived()).coeff(row,col);
     }
 
@@ -174,11 +179,11 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
     {
       EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
       eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
-      
+
       return internal::evaluator<Derived>(derived()).coeff(i);
     }
-    
-  
+
+
 };
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/ProductEvaluators.h b/contrib/eigen/Eigen/src/Core/ProductEvaluators.h
index bce1310c96f91c550744a53eeb9d2bd4415e4cce..8cf294b287bc9a458cd9b5f0bb4c03e4a5b51126 100644
--- a/contrib/eigen/Eigen/src/Core/ProductEvaluators.h
+++ b/contrib/eigen/Eigen/src/Core/ProductEvaluators.h
@@ -14,27 +14,27 @@
 #define EIGEN_PRODUCTEVALUATORS_H
 
 namespace Eigen {
-  
+
 namespace internal {
 
 /** \internal
   * Evaluator of a product expression.
   * Since products require special treatments to handle all possible cases,
-  * we simply deffer the evaluation logic to a product_evaluator class
+  * we simply defer the evaluation logic to a product_evaluator class
   * which offers more partial specialization possibilities.
-  * 
+  *
   * \sa class product_evaluator
   */
 template<typename Lhs, typename Rhs, int Options>
-struct evaluator<Product<Lhs, Rhs, Options> > 
+struct evaluator<Product<Lhs, Rhs, Options> >
  : public product_evaluator<Product<Lhs, Rhs, Options> >
 {
   typedef Product<Lhs, Rhs, Options> XprType;
   typedef product_evaluator<XprType> Base;
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
- 
+
 // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
 // TODO we should apply that rule only if that's really helpful
 template<typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
@@ -62,12 +62,12 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
 
 
 template<typename Lhs, typename Rhs, int DiagIndex>
-struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> > 
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
  : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
 {
   typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
   typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
     : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
         Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
@@ -108,27 +108,27 @@ struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsSh
     : m_result(xpr.rows(), xpr.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
-    
+
 // FIXME shall we handle nested_eval here?,
 // if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
 //     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
 //     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
 //     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
 //     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
-//     
+//
 //     const LhsNested lhs(xpr.lhs());
 //     const RhsNested rhs(xpr.rhs());
-//   
+//
 //     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
 
     generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
   }
-  
-protected:  
+
+protected:
   PlainObject m_result;
 };
 
-// The following three shortcuts are enabled only if the scalar types match excatly.
+// The following three shortcuts are enabled only if the scalar types match exactly.
 // TODO: we could enable them for different scalar types when the product is not vectorized.
 
 // Dense = Product
@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
   typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     Index dstRows = src.rows();
@@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
   typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
   typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
   typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
                         const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
                         const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
   {
     call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@@ -217,7 +217,7 @@ template<typename DstXprType, typename OtherXpr, typename ProductType, typename
 struct assignment_from_xpr_op_product
 {
   template<typename SrcXprType, typename InitialFunc>
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
   {
     call_assignment_no_alias(dst, src.lhs(), Func1());
@@ -246,19 +246,19 @@ template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 {
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
   }
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
   }
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
 };
 
@@ -269,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 
 // Column major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
   evaluator<Rhs> rhsEval(rhs);
-  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
   // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
@@ -282,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
 
 // Row major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
   evaluator<Lhs> lhsEval(lhs);
-  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
   // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
@@ -298,43 +298,43 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
 {
   template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
   // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
-  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-  struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct set  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+  struct add  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+  struct sub  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
   struct adds {
     Scalar m_scale;
     explicit adds(const Scalar& s) : m_scale(s) {}
-    template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+    template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
       dst.const_cast_derived() += m_scale * src;
     }
   };
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
   }
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
   }
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
   }
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
   }
-  
+
 };
 
 
@@ -343,21 +343,21 @@ template<typename Lhs, typename Rhs, typename Derived>
 struct generic_product_impl_base
 {
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
 
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
 
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
 
 };
@@ -373,8 +373,13 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
   typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
 
   template<typename Dest>
-  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
+    // Fallback to inner product if both the lhs and rhs is a runtime vector.
+    if (lhs.rows() == 1 && rhs.cols() == 1) {
+      dst.coeffRef(0,0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0));
+      return;
+    }
     LhsNested actual_lhs(lhs);
     RhsNested actual_rhs(rhs);
     internal::gemv_dense_selector<Side,
@@ -385,12 +390,12 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
 };
 
 template<typename Lhs, typename Rhs>
-struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> 
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
 {
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // Same as: dst.noalias() = lhs.lazyProduct(rhs);
     // but easier on the compiler side
@@ -398,48 +403,71 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
   }
 
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // dst.noalias() += lhs.lazyProduct(rhs);
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
   }
-  
+
   template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // dst.noalias() -= lhs.lazyProduct(rhs);
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
   }
 
-  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
-  //    dst {,+,-}= s * (A.lazyProduct(B))
-  // This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
-  // For them, this strategy is also faster than simply by-passing the heap allocation through
-  // stack allocation.
-  // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
-  // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
-  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
-  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+  // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
+  //   dst {,+,-}= (s1*A)*(B*s2)
+  // will be rewritten as:
+  //   dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
+  // There are at least four benefits of doing so:
+  //  1 - huge performance gain for heap-allocated matrix types as it save costly allocations.
+  //  2 - it is faster than simply by-passing the heap allocation through stack allocation.
+  //  3 - it makes this fallback consistent with the heavy GEMM routine.
+  //  4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
+  //      (see https://stackoverflow.com/questions/54738495)
+  // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently
+  // enabled only when falling back from the main GEMM.
+  template<typename Dst, typename Func>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
-                                           const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+  void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func)
   {
-    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+    enum {
+      HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
+      ConjLhs = blas_traits<Lhs>::NeedToConjugate,
+      ConjRhs = blas_traits<Rhs>::NeedToConjugate
+    };
+    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+    //        this is important for real*complex_mat
+    Scalar actualAlpha = combine_scalar_factors<Scalar>(lhs, rhs);
+
+    eval_dynamic_impl(dst,
+                      blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
+                      blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
+                      func,
+                      actualAlpha,
+                      typename conditional<HasScalarFactor,true_type,false_type>::type());
   }
 
-  // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
-  // overload more specialized.
-  template<typename Dst, typename LhsT, typename Func>
+protected:
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar&  s /* == 1 */, false_type)
+  {
+    EIGEN_UNUSED_VARIABLE(s);
+    eigen_internal_assert(s==Scalar(1));
+    call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
+  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type)
   {
-    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+    call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
   }
-  
-  
-//   template<typename Dst>
-//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-//   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
 };
 
 // This specialization enforces the use of a coefficient-based evaluation strategy
@@ -497,7 +525,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
 
   typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
   typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-  
+
   typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
   typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
 
@@ -516,19 +544,19 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
 
   enum {
-      
+
     LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
     RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
     CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
                   : InnerSize == Dynamic ? HugeCost
-                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+                    : InnerSize * (NumTraits<Scalar>::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost))
                     + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
 
     Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-    
+
     LhsFlags = LhsEtorType::Flags,
     RhsFlags = RhsEtorType::Flags,
-    
+
     LhsRowMajor = LhsFlags & RowMajorBit,
     RhsRowMajor = RhsFlags & RowMajorBit,
 
@@ -538,7 +566,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     // Here, we don't care about alignment larger than the usable packet size.
     LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
     RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
-      
+
     SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
 
     CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1),
@@ -548,12 +576,12 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
                     : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
                     : (bool(RhsRowMajor) && !CanVectorizeLhs),
 
-    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit)
           | (EvalToRowMajor ? RowMajorBit : 0)
           // TODO enable vectorization for mixed types
           | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
           | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
-          
+
     LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
     RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
 
@@ -569,10 +597,10 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
     CanVectorizeInner =    SameType
                         && LhsRowMajor
                         && (!RhsRowMajor)
-                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                        && (InnerSize % packet_traits<Scalar>::size == 0)
+                        && (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit)
+                        && (int(InnerSize) % packet_traits<Scalar>::size == 0)
   };
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
   {
     return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
@@ -582,7 +610,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    * which is why we don't set the LinearAccessBit.
    * TODO: this seems possible when the result is a vector
    */
-  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const CoeffReturnType coeff(Index index) const
   {
     const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
     const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
@@ -590,6 +619,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   }
 
   template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const PacketType packet(Index row, Index col) const
   {
     PacketType res;
@@ -601,6 +631,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   }
 
   template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const PacketType packet(Index index) const
   {
     const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
@@ -611,7 +642,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
 protected:
   typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
   typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
-  
+
   LhsEtorType m_lhsImpl;
   RhsEtorType m_rhsImpl;
 
@@ -629,7 +660,8 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProduc
   enum {
     Flags = Base::Flags | EvalBeforeNestingBit
   };
-  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit product_evaluator(const XprType& xpr)
     : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
   {}
 };
@@ -641,7 +673,7 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProduc
 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
     res =  pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet<LoadMode,Packet>(Index(UnrollingIndex-1), col), res);
@@ -651,7 +683,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
   {
     etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
     res =  pmadd(lhs.template packet<LoadMode,Packet>(row, Index(UnrollingIndex-1)), pset1<Packet>(rhs.coeff(Index(UnrollingIndex-1), col)), res);
@@ -661,7 +693,7 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
     res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),rhs.template packet<LoadMode,Packet>(Index(0), col));
   }
@@ -670,7 +702,7 @@ struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
   {
     res = pmul(lhs.template packet<LoadMode,Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
   }
@@ -679,7 +711,7 @@ struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
@@ -688,7 +720,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
   {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
   }
@@ -697,7 +729,7 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
@@ -708,7 +740,7 @@ struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
   {
     res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
     for(Index i = 0; i < innerDim; ++i)
@@ -730,7 +762,7 @@ struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
   : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
 {
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
   template<typename Dest>
   static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
@@ -744,7 +776,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
 : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
 {
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
   template<typename Dest>
   static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
@@ -765,9 +797,10 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
   : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
 {
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
   template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
   }
@@ -778,7 +811,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
 : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
 {
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-  
+
   template<typename Dest>
   static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
@@ -790,7 +823,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag>
 /***************************************************************************
 * Diagonal products
 ***************************************************************************/
-  
+
 template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
 struct diagonal_product_evaluator_base
   : evaluator_base<Derived>
@@ -798,17 +831,25 @@ struct diagonal_product_evaluator_base
    typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
 public:
   enum {
-    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
-    
+    CoeffReadCost = int(NumTraits<Scalar>::MulCost) + int(evaluator<MatrixType>::CoeffReadCost) + int(evaluator<DiagonalType>::CoeffReadCost),
+
     MatrixFlags = evaluator<MatrixType>::Flags,
     DiagFlags = evaluator<DiagonalType>::Flags,
-    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+
+    _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor
+                  : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor
+                  : MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor),
+
     _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
                            ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
     _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
     // FIXME currently we need same types, but in the future the next rule should be the one
     //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable =   bool(int(MatrixFlags)&PacketAccessBit)
+                  &&  _SameTypes
+                  && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit)
+                  && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
     _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
     Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
     Alignment = evaluator<MatrixType>::Alignment,
@@ -817,14 +858,14 @@ public:
                       ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
                       ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
   };
-  
-  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
+
+  EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
     : m_diagImpl(diag), m_matImpl(mat)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
   {
     if(AsScalarProduct)
@@ -832,7 +873,7 @@ public:
     else
       return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
   }
-  
+
 protected:
   template<int LoadMode,typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
@@ -840,7 +881,7 @@ protected:
     return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
                           internal::pset1<PacketType>(m_diagImpl.coeff(id)));
   }
-  
+
   template<int LoadMode,typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
   {
@@ -851,7 +892,7 @@ protected:
     return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
                           m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
   }
-  
+
   evaluator<DiagonalType> m_diagImpl;
   evaluator<MatrixType>   m_matImpl;
 };
@@ -866,25 +907,25 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
   using Base::m_matImpl;
   using Base::coeff;
   typedef typename Base::Scalar Scalar;
-  
+
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
-  
-  enum {
-    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
-  };
+  typedef typename Lhs::DiagonalVectorType DiagonalType;
+
+
+  enum { StorageOrder = Base::_StorageOrder };
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(xpr.rhs(), xpr.lhs().diagonal())
   {
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
   {
     return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
   }
-  
-#ifndef __CUDACC__
+
+#ifndef EIGEN_GPUCC
   template<int LoadMode,typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
   {
@@ -893,7 +934,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
     return this->template packet_impl<LoadMode,PacketType>(row,col, row,
                                  typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
   }
-  
+
   template<int LoadMode,typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index idx) const
   {
@@ -912,30 +953,30 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
   using Base::m_matImpl;
   using Base::coeff;
   typedef typename Base::Scalar Scalar;
-  
+
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
-  
-  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
+
+  enum { StorageOrder = Base::_StorageOrder };
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(xpr.lhs(), xpr.rhs().diagonal())
   {
   }
-  
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
   {
     return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
   }
-  
-#ifndef __CUDACC__
+
+#ifndef EIGEN_GPUCC
   template<int LoadMode,typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
   {
     return this->template packet_impl<LoadMode,PacketType>(row,col, col,
                                  typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
   }
-  
+
   template<int LoadMode,typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index idx) const
   {
@@ -963,7 +1004,7 @@ struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
     typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
 
     template<typename Dest, typename PermutationType>
-    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
     {
       MatrixType mat(xpr);
       const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
@@ -1017,7 +1058,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
   {
     permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
   }
@@ -1027,7 +1068,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
   {
     permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
   }
@@ -1037,7 +1078,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs)
   {
     permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
   }
@@ -1047,7 +1088,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs)
   {
     permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
   }
@@ -1069,9 +1110,9 @@ struct transposition_matrix_product
 {
   typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
   typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
-  
+
   template<typename Dest, typename TranspositionType>
-  static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
   {
     MatrixType mat(xpr);
     typedef typename TranspositionType::StorageIndex StorageIndex;
@@ -1094,7 +1135,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
   {
     transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
   }
@@ -1104,7 +1145,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
   {
     transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
   }
@@ -1115,7 +1156,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs)
   {
     transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
   }
@@ -1125,7 +1166,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs)
   {
     transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
   }
diff --git a/contrib/eigen/Eigen/src/Core/Random.h b/contrib/eigen/Eigen/src/Core/Random.h
index 6faf789c7618d90b15cbf8c8b39ca6a3b3992413..dab2ac8e9e84e564aa90465089d15188be8cda1c 100644
--- a/contrib/eigen/Eigen/src/Core/Random.h
+++ b/contrib/eigen/Eigen/src/Core/Random.h
@@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
   * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
   */
 template<typename Derived>
-inline Derived& DenseBase<Derived>::setRandom()
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
 {
   return *this = Random(rows(), cols());
 }
@@ -177,6 +177,42 @@ PlainObjectBase<Derived>::setRandom(Index rows, Index cols)
   return setRandom();
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+  * coefficients in this expression to random values. For the parameter of type
+  * NoChange_t, just pass the special value \c NoChange.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(NoChange_t, Index cols)
+{
+  return setRandom(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+  * coefficients in this expression to random values. For the parameter of type
+  * NoChange_t, just pass the special value \c NoChange.
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  *
+  * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE Derived&
+PlainObjectBase<Derived>::setRandom(Index rows, NoChange_t)
+{
+  return setRandom(rows, cols());
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_RANDOM_H
diff --git a/contrib/eigen/Eigen/src/Core/Redux.h b/contrib/eigen/Eigen/src/Core/Redux.h
index 760e9f86154cb7249436753d987e2d340bc40643..b6790d11050bb340c83acaa72197eba395c80222 100644
--- a/contrib/eigen/Eigen/src/Core/Redux.h
+++ b/contrib/eigen/Eigen/src/Core/Redux.h
@@ -23,23 +23,29 @@ namespace internal {
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/
 
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
 struct redux_traits
 {
 public:
-    typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
   enum {
     PacketSize = unpacket_traits<PacketType>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+    InnerMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxColsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime,
+    OuterMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxRowsAtCompileTime
+                 : Evaluator::MaxColsAtCompileTime,
+    SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic
+                        : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0)
+                        : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize)
   };
 
   enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
-    MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
+    MaySliceVectorize  = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3)
   };
 
 public:
@@ -51,8 +57,8 @@ public:
 
 public:
   enum {
-    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
-         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+         : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };
 
@@ -64,18 +70,20 @@ public:
 #ifdef EIGEN_DEBUG_ASSIGN
   static void debug()
   {
-    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
     std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Derived::Flags)
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
     std::cerr.unsetf(std::ios::hex);
     EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(OuterMaxSize)
+    EIGEN_DEBUG_VAR(SliceVectorizedWork)
     EIGEN_DEBUG_VAR(PacketSize)
     EIGEN_DEBUG_VAR(MightVectorize)
     EIGEN_DEBUG_VAR(MayLinearVectorize)
     EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
     EIGEN_DEBUG_VAR(UnrollingLimit)
-    EIGEN_DEBUG_VAR(Unrolling)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
     std::cerr << std::endl;
   }
 #endif
@@ -87,88 +95,86 @@ public:
 
 /*** no vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_novec_unroller
 {
   enum {
     HalfLength = Length/2
   };
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
   {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
 {
   enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
+    inner = Start % Evaluator::InnerSizeAtCompileTime
   };
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
   {
-    return mat.coeffByOuterInner(outer, inner);
+    return eval.coeffByOuterInner(outer, inner);
   }
 };
 
 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
   EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };
 
 /*** vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_vec_unroller
 {
-  enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
-
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func)
   {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      HalfLength = Length/2
+    };
+
     return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval,func),
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::template run<PacketType>(eval,func) );
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
 {
-  enum {
-    index = Start * redux_traits<Func, Derived>::PacketSize,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = Derived::Alignment
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
-
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&)
   {
-    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      index = Start * PacketSize,
+      outer = index / int(Evaluator::InnerSizeAtCompileTime),
+      inner = index % int(Evaluator::InnerSizeAtCompileTime),
+      alignment = Evaluator::Alignment
+    };
+    return eval.template packetByOuterInner<alignment,PacketType>(outer, inner);
   }
 };
 
@@ -176,53 +182,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
 * Part 3 : implementation of all cases
 ***************************************************************************/
 
-template<typename Func, typename Derived,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
 >
 struct redux_impl;
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  typedef typename Evaluator::Scalar Scalar;
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+    res = eval.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < xpr.innerSize(); ++i)
+      res = func(res, eval.coeffByOuterInner(0, i));
+    for(Index i = 1; i < xpr.outerSize(); ++i)
+      for(Index j = 0; j < xpr.innerSize(); ++j)
+        res = func(res, eval.coeffByOuterInner(i, j));
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
-{};
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
+{
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
+  {
+    return Base::run(eval,func);
+  }
+};
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
 
-  static Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
+  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    const Index size = mat.size();
+    const Index size = xpr.size();
     
-    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
     const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
-      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
     };
-    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(xpr);
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -230,34 +248,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
         }
 
         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);
 
       for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
 
       for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = mat.coeff(0);
+      res = eval.coeff(0);
       for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
     }
 
     return res;
@@ -265,130 +283,108 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 };
 
 // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Derived::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketType;
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
 
-  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
-    const Index outerSize = mat.outerSize();
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
     enum {
-      packetSize = redux_traits<Func, Derived>::PacketSize
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     {
-      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
 
       res = func.predux(packet_res);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
+          res = func(res, eval.coeffByOuterInner(j,i));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
     }
 
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
   enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
-    Size = Derived::SizeAtCompileTime,
-    VectorizedSize = (Size / PacketSize) * PacketSize
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+    Size = Evaluator::SizeAtCompileTime,
+    VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize)
   };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
   {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::template run<PacketType>(eval,func));
       if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
       return res;
     }
     else {
-      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
     }
   }
 };
 
 // evaluator adaptor
 template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
 {
+  typedef internal::evaluator<_XprType> Base;
 public:
   typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
   
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
-  typedef typename XprType::PacketReturnType PacketReturnType;
   
   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
     // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
-    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
-    Alignment = evaluator<XprType>::Alignment
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
   };
   
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
-  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
-  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index row, Index col) const
-  { return m_evaluator.coeff(row, col); }
-
-  EIGEN_DEVICE_FUNC
-  CoeffReturnType coeff(Index index) const
-  { return m_evaluator.coeff(index); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index row, Index col) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
-
-  template<int LoadMode, typename PacketType>
-  PacketType packet(Index index) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
-  
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
   
   template<int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
   
-  const XprType & nestedExpression() const { return m_xpr; }
-  
-protected:
-  internal::evaluator<XprType> m_evaluator;
-  const XprType &m_xpr;
 };
 
 } // end namespace internal
@@ -403,39 +399,53 @@ protected:
   * The template parameter \a BinaryOp is the type of the functor \a func which must be
   * an associative operator. Both current C++98 and C++11 functor styles are handled.
   *
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
   * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
   */
 template<typename Derived>
 template<typename Func>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
   eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
 
   typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
-  
-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator to help  
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }
 
 /** \returns the minimum of all coefficients of \c *this.
-  * \warning the result is undefined if \c *this contains NaN.
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+template<int NaNPropagation>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
 }
 
-/** \returns the maximum of all coefficients of \c *this.
-  * \warning the result is undefined if \c *this contains NaN.
+/** \returns the maximum of all coefficients of \c *this. 
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+template<int NaNPropagation>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
 }
 
 /** \returns the sum of all coefficients of \c *this
@@ -445,7 +455,7 @@ DenseBase<Derived>::maxCoeff() const
   * \sa trace(), prod(), mean()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::sum() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -458,7 +468,7 @@ DenseBase<Derived>::sum() const
 * \sa trace(), prod(), sum()
 */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
 #ifdef __INTEL_COMPILER
@@ -479,7 +489,7 @@ DenseBase<Derived>::mean() const
   * \sa sum(), mean(), trace()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::prod() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -494,7 +504,7 @@ DenseBase<Derived>::prod() const
   * \sa diagonal(), sum()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 MatrixBase<Derived>::trace() const
 {
   return derived().diagonal().sum();
diff --git a/contrib/eigen/Eigen/src/Core/Ref.h b/contrib/eigen/Eigen/src/Core/Ref.h
index 17a1496b842baf819ee7b2f38027ba983caf9528..c2a37eadbb6321211a4ddfd52363760eee44ea18 100644
--- a/contrib/eigen/Eigen/src/Core/Ref.h
+++ b/contrib/eigen/Eigen/src/Core/Ref.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_REF_H
 #define EIGEN_REF_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -48,7 +48,7 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
     };
     typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
   };
-  
+
 };
 
 template<typename Derived>
@@ -67,12 +67,12 @@ public:
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  EIGEN_DEVICE_FUNC inline Index innerStride() const
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const
   {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  EIGEN_DEVICE_FUNC inline Index outerStride() const
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const
   {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
          : IsVectorAtCompileTime ? this->size()
@@ -86,36 +86,122 @@ public:
       m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
                StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime)
   {}
-  
+
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase)
 
 protected:
 
   typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
 
+  // Resolves inner stride if default 0.
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) {
+    return inner == 0 ? 1 : inner;
+  }
+
+  // Resolves outer stride if default 0.
+  static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) {
+    return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
+  }
+
+  // Returns true if construction is valid, false if there is a stride mismatch,
+  // and fails if there is a size mismatch.
   template<typename Expression>
-  EIGEN_DEVICE_FUNC void construct(Expression& expr)
+  EIGEN_DEVICE_FUNC bool construct(Expression& expr)
   {
-    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
-
+    // Check matrix sizes.  If this is a compile-time vector, we do allow
+    // implicitly transposing.
+    EIGEN_STATIC_ASSERT(
+      EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression)
+      // If it is a vector, the transpose sizes might match.
+      || ( PlainObjectType::IsVectorAtCompileTime
+            && ((int(PlainObjectType::RowsAtCompileTime)==Eigen::Dynamic
+              || int(Expression::ColsAtCompileTime)==Eigen::Dynamic
+              || int(PlainObjectType::RowsAtCompileTime)==int(Expression::ColsAtCompileTime))
+            &&  (int(PlainObjectType::ColsAtCompileTime)==Eigen::Dynamic
+              || int(Expression::RowsAtCompileTime)==Eigen::Dynamic
+              || int(PlainObjectType::ColsAtCompileTime)==int(Expression::RowsAtCompileTime)))),
+      YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES
+    )
+
+    // Determine runtime rows and columns.
+    Index rows = expr.rows();
+    Index cols = expr.cols();
     if(PlainObjectType::RowsAtCompileTime==1)
     {
       eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), 1, expr.size());
+      rows = 1;
+      cols = expr.size();
     }
     else if(PlainObjectType::ColsAtCompileTime==1)
     {
       eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.size(), 1);
+      rows = expr.size();
+      cols = 1;
+    }
+    // Verify that the sizes are valid.
+    eigen_assert(
+      (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows));
+    eigen_assert(
+      (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols));
+
+
+    // If this is a vector, we might be transposing, which means that stride should swap.
+    const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows());
+    // If the storage format differs, we also need to swap the stride.
+    const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0;
+    const bool expr_row_major = (Expression::Flags&RowMajorBit) != 0;
+    const bool storage_differs =  (row_major != expr_row_major);
+
+    const bool swap_stride = (transpose != storage_differs);
+
+    // Determine expr's actual strides, resolving any defaults if zero.
+    const Index expr_inner_actual = resolveInnerStride(expr.innerStride());
+    const Index expr_outer_actual = resolveOuterStride(expr_inner_actual,
+                                                       expr.outerStride(),
+                                                       expr.rows(),
+                                                       expr.cols(),
+                                                       Expression::IsVectorAtCompileTime != 0,
+                                                       expr_row_major);
+
+    // If this is a column-major row vector or row-major column vector, the inner-stride
+    // is arbitrary, so set it to either the compile-time inner stride or 1.
+    const bool row_vector = (rows == 1);
+    const bool col_vector = (cols == 1);
+    const Index inner_stride =
+        ( (!row_major && row_vector) || (row_major && col_vector) ) ?
+            ( StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1)
+            : swap_stride ? expr_outer_actual : expr_inner_actual;
+
+    // If this is a column-major column vector or row-major row vector, the outer-stride
+    // is arbitrary, so set it to either the compile-time outer stride or vector size.
+    const Index outer_stride =
+      ( (!row_major && col_vector) || (row_major && row_vector) ) ?
+          ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride)
+          : swap_stride ? expr_inner_actual : expr_outer_actual;
+
+    // Check if given inner/outer strides are compatible with compile-time strides.
+    const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic)
+        || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride);
+    if (!inner_valid) {
+      return false;
     }
-    else
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
-    
-    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
-      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
-    else
-      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+
+    const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic)
+        || (resolveOuterStride(
+              inner_stride,
+              Index(StrideType::OuterStrideAtCompileTime),
+              rows, cols, PlainObjectType::IsVectorAtCompileTime != 0,
+              row_major)
+            == outer_stride);
+    if (!outer_valid) {
+      return false;
+    }
+
+    ::new (static_cast<Base*>(this)) Base(expr.data(), rows, cols);
+    ::new (&m_stride) StrideBase(
+      (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride,
+      (StrideType::InnerStrideAtCompileTime == 0) ? 0 : inner_stride );
+    return true;
   }
 
   StrideBase m_stride;
@@ -187,6 +273,8 @@ protected:
   * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
   * \endcode
   *
+  * See also the following stackoverflow questions for further references:
+  *  - <a href="http://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class">Correct usage of the Eigen::Ref<> class</a>
   *
   * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
   */
@@ -210,7 +298,10 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
                                  typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     {
       EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
-      Base::construct(expr.derived());
+      // Construction must pass since we will not create temprary storage in the non-const case.
+      const bool success = Base::construct(expr.derived());
+      EIGEN_UNUSED_VARIABLE(success)
+      eigen_assert(success);
     }
     template<typename Derived>
     EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
@@ -224,7 +315,10 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
       EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
       EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
       EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      Base::construct(expr.const_cast_derived());
+      // Construction must pass since we will not create temporary storage in the non-const case.
+      const bool success = Base::construct(expr.const_cast_derived());
+      EIGEN_UNUSED_VARIABLE(success)
+      eigen_assert(success);
     }
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
@@ -265,7 +359,10 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
     template<typename Expression>
     EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
     {
-      Base::construct(expr);
+      // Check if we can use the underlying expr's storage directly, otherwise call the copy version.
+      if (!Base::construct(expr)) {
+        construct(expr, internal::false_type());
+      }
     }
 
     template<typename Expression>
diff --git a/contrib/eigen/Eigen/src/Core/Replicate.h b/contrib/eigen/Eigen/src/Core/Replicate.h
index 9960ef884ef68f5828caa3784e2cba2b177a9594..ab5be7e64bc2b464760d1f6b9e2998b18fa416cb 100644
--- a/contrib/eigen/Eigen/src/Core/Replicate.h
+++ b/contrib/eigen/Eigen/src/Core/Replicate.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_REPLICATE_H
 #define EIGEN_REPLICATE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename MatrixType,int RowFactor,int ColFactor>
@@ -35,7 +35,7 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
     IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
                : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
                : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    
+
     // FIXME enable DirectAccess with negative strides?
     Flags = IsRowMajor ? RowMajorBit : 0
   };
@@ -88,15 +88,15 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
                           THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
     }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
 
     EIGEN_DEVICE_FUNC
     const _MatrixTypeNested& nestedExpression() const
-    { 
-      return m_matrix; 
+    {
+      return m_matrix;
     }
 
   protected:
@@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
   */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-const Replicate<Derived,RowFactor,ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
   return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
   * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
   */
 template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
 VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
 {
   return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
diff --git a/contrib/eigen/Eigen/src/Core/ReturnByValue.h b/contrib/eigen/Eigen/src/Core/ReturnByValue.h
index c44b7673bb3ef689ae9f1073989a75fb5fd1f27f..4dad13ea11872682099c8d41438a87a5ef9726ec 100644
--- a/contrib/eigen/Eigen/src/Core/ReturnByValue.h
+++ b/contrib/eigen/Eigen/src/Core/ReturnByValue.h
@@ -60,8 +60,10 @@ template<typename Derived> class ReturnByValue
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const
     { static_cast<const Derived*>(this)->evalTo(dst); }
-    EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return static_cast<const Derived*>(this)->cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
@@ -79,7 +81,7 @@ template<typename Derived> class ReturnByValue
 
 template<typename Derived>
 template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
   other.evalTo(derived());
   return derived();
@@ -90,7 +92,7 @@ namespace internal {
 // Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
 // when a ReturnByValue expression is assigned, the evaluator is not constructed.
 // TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
-  
+
 template<typename Derived>
 struct evaluator<ReturnByValue<Derived> >
   : public evaluator<typename internal::traits<Derived>::ReturnType>
@@ -98,7 +100,7 @@ struct evaluator<ReturnByValue<Derived> >
   typedef ReturnByValue<Derived> XprType;
   typedef typename internal::traits<Derived>::ReturnType PlainObject;
   typedef evaluator<PlainObject> Base;
-  
+
   EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
     : m_result(xpr.rows(), xpr.cols())
   {
diff --git a/contrib/eigen/Eigen/src/Core/Reverse.h b/contrib/eigen/Eigen/src/Core/Reverse.h
index 0640cda2a1509e099bd16a434b3114de7b1c0d38..28cdd76acaa83e93e2d9775e3e3fc1c851c2279a 100644
--- a/contrib/eigen/Eigen/src/Core/Reverse.h
+++ b/contrib/eigen/Eigen/src/Core/Reverse.h
@@ -12,7 +12,7 @@
 #ifndef EIGEN_REVERSE_H
 #define EIGEN_REVERSE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -44,7 +44,7 @@ template<typename PacketType> struct reverse_packet_cond<PacketType,false>
   static inline PacketType run(const PacketType& x) { return x; }
 };
 
-} // end namespace internal 
+} // end namespace internal
 
 /** \class Reverse
   * \ingroup Core_Module
@@ -89,8 +89,10 @@ template<typename MatrixType, int Direction> class Reverse
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
@@ -98,7 +100,7 @@ template<typename MatrixType, int Direction> class Reverse
     }
 
     EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() const 
+    nestedExpression() const
     {
       return m_matrix;
     }
@@ -114,7 +116,7 @@ template<typename MatrixType, int Direction> class Reverse
   *
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
   return ReverseReturnType(derived());
@@ -136,7 +138,7 @@ DenseBase<Derived>::reverse()
   *
   * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
 {
   if(cols()>rows())
   {
@@ -161,7 +163,7 @@ inline void DenseBase<Derived>::reverseInPlace()
 }
 
 namespace internal {
-  
+
 template<int Direction>
 struct vectorwise_reverse_inplace_impl;
 
@@ -171,8 +173,10 @@ struct vectorwise_reverse_inplace_impl<Vertical>
   template<typename ExpressionType>
   static void run(ExpressionType &xpr)
   {
+    const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2;
     Index half = xpr.rows()/2;
-    xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
+    xpr.topRows(fix<HalfAtCompileTime>(half))
+       .swap(xpr.bottomRows(fix<HalfAtCompileTime>(half)).colwise().reverse());
   }
 };
 
@@ -182,8 +186,10 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
   template<typename ExpressionType>
   static void run(ExpressionType &xpr)
   {
+    const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2;
     Index half = xpr.cols()/2;
-    xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
+    xpr.leftCols(fix<HalfAtCompileTime>(half))
+       .swap(xpr.rightCols(fix<HalfAtCompileTime>(half)).rowwise().reverse());
   }
 };
 
@@ -201,9 +207,9 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
   *
   * \sa DenseBase::reverseInPlace(), reverse() */
 template<typename ExpressionType, int Direction>
-void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
-  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
 }
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/Select.h b/contrib/eigen/Eigen/src/Core/Select.h
index 79eec1b5b0e08a5eb045e3d482ed42d1da4e9d32..7c86bf87c170163f04b4fda8a4210526b16e3bbd 100644
--- a/contrib/eigen/Eigen/src/Core/Select.h
+++ b/contrib/eigen/Eigen/src/Core/Select.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELECT_H
 #define EIGEN_SELECT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Select
   * \ingroup Core_Module
@@ -67,8 +67,10 @@ class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, Then
       eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
     }
 
-    inline EIGEN_DEVICE_FUNC Index rows() const { return m_condition.rows(); }
-    inline EIGEN_DEVICE_FUNC Index cols() const { return m_condition.cols(); }
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_condition.rows(); }
+    inline EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_condition.cols(); }
 
     inline EIGEN_DEVICE_FUNC
     const Scalar coeff(Index i, Index j) const
@@ -120,7 +122,7 @@ class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, Then
   */
 template<typename Derived>
 template<typename ThenDerived,typename ElseDerived>
-inline const Select<Derived,ThenDerived,ElseDerived>
+inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived,ElseDerived>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
                             const DenseBase<ElseDerived>& elseMatrix) const
 {
@@ -134,7 +136,7 @@ DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
   */
 template<typename Derived>
 template<typename ThenDerived>
-inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+inline EIGEN_DEVICE_FUNC const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
                            const typename ThenDerived::Scalar& elseScalar) const
 {
@@ -149,7 +151,7 @@ DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
   */
 template<typename Derived>
 template<typename ElseDerived>
-inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
+inline EIGEN_DEVICE_FUNC const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
 DenseBase<Derived>::select(const typename ElseDerived::Scalar& thenScalar,
                            const DenseBase<ElseDerived>& elseMatrix) const
 {
diff --git a/contrib/eigen/Eigen/src/Core/SelfAdjointView.h b/contrib/eigen/Eigen/src/Core/SelfAdjointView.h
index b2e51f37ac47ceb647f5a9002fec1eba0b0a5e4e..8ce3b372a0ff792b98b8b1b00f1dd55742779bd0 100644
--- a/contrib/eigen/Eigen/src/Core/SelfAdjointView.h
+++ b/contrib/eigen/Eigen/src/Core/SelfAdjointView.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SELFADJOINTMATRIX_H
 #define EIGEN_SELFADJOINTMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class SelfAdjointView
   * \ingroup Core_Module
@@ -58,14 +58,15 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     typedef MatrixTypeNestedCleaned NestedExpression;
 
     /** \brief The type of coefficients in this matrix */
-    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
+    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
     typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+    typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
 
     enum {
       Mode = internal::traits<SelfAdjointView>::Mode,
       Flags = internal::traits<SelfAdjointView>::Flags,
-      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
+      TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0)
     };
     typedef typename MatrixType::PlainObject PlainObject;
 
@@ -75,14 +76,14 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
       EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
     }
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_matrix.rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return m_matrix.innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); }
 
     /** \sa MatrixBase::coeff()
       * \warning the coordinates must fit into the referenced triangular part
@@ -131,7 +132,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     {
       return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
     }
-    
+
     friend EIGEN_DEVICE_FUNC
     const SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,MatrixType,product),UpLo>
     operator*(const Scalar& s, const SelfAdjointView& mat)
@@ -197,6 +198,18 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     inline const ConjugateReturnType conjugate() const
     { return ConjugateReturnType(m_matrix.conjugate()); }
 
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
     typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
     /** \sa MatrixBase::adjoint() const */
     EIGEN_DEVICE_FUNC
@@ -287,17 +300,17 @@ protected:
   using Base::m_src;
   using Base::m_functor;
 public:
-  
+
   typedef typename Base::DstEvaluatorType DstEvaluatorType;
   typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
   typedef typename Base::Scalar Scalar;
   typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
+
+
   EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
-  
+
   EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
     eigen_internal_assert(row!=col);
@@ -305,12 +318,12 @@ public:
     m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
     m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
   }
-  
+
   EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
     Base::assignCoeff(id,id);
   }
-  
+
   EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
   { eigen_internal_assert(false && "should never be called"); }
 };
@@ -324,7 +337,7 @@ public:
 /** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
   return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
@@ -341,7 +354,7 @@ MatrixBase<Derived>::selfadjointView() const
   */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
   return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
diff --git a/contrib/eigen/Eigen/src/Core/Solve.h b/contrib/eigen/Eigen/src/Core/Solve.h
index a8daea511355dbbf71c142f77ea52648627513a6..23d5cb70728f0f2e52d3f01d94bdc86e37fd9292 100644
--- a/contrib/eigen/Eigen/src/Core/Solve.h
+++ b/contrib/eigen/Eigen/src/Core/Solve.h
@@ -13,13 +13,13 @@
 namespace Eigen {
 
 template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
-  
+
 /** \class Solve
   * \ingroup Core_Module
   *
   * \brief Pseudo expression representing a solving operation
   *
-  * \tparam Decomposition the type of the matrix or decomposion object
+  * \tparam Decomposition the type of the matrix or decomposition object
   * \tparam Rhstype the type of the right-hand side
   *
   * This class represents an expression of A.solve(B)
@@ -64,13 +64,13 @@ class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<R
 public:
   typedef typename internal::traits<Solve>::PlainObject PlainObject;
   typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
-  
+
   Solve(const Decomposition &dec, const RhsType &rhs)
     : m_dec(dec), m_rhs(rhs)
   {}
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
   EIGEN_DEVICE_FUNC const RhsType&       rhs() const { return m_rhs; }
@@ -87,14 +87,14 @@ class SolveImpl<Decomposition,RhsType,Dense>
   : public MatrixBase<Solve<Decomposition,RhsType> >
 {
   typedef Solve<Decomposition,RhsType> Derived;
-  
+
 public:
-  
+
   typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
 
 private:
-  
+
   Scalar coeff(Index row, Index col) const;
   Scalar coeff(Index i) const;
 };
@@ -119,15 +119,15 @@ struct evaluator<Solve<Decomposition,RhsType> >
   typedef evaluator<PlainObject> Base;
 
   enum { Flags = Base::Flags | EvalBeforeNestingBit };
-  
+
   EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
     : m_result(solve.rows(), solve.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
     solve.dec()._solve_impl(solve.rhs(), m_result);
   }
-  
-protected:  
+
+protected:
   PlainObject m_result;
 };
 
@@ -176,12 +176,12 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
     Index dstCols = src.cols();
     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
       dst.resize(dstRows, dstCols);
-    
+
     src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
   }
 };
 
-} // end namepsace internal
+} // end namespace internal
 
 } // end namespace Eigen
 
diff --git a/contrib/eigen/Eigen/src/Core/SolveTriangular.h b/contrib/eigen/Eigen/src/Core/SolveTriangular.h
index fd0acb1a58eb1f372dfd11076175921ae07a4fbf..dfbf99523a9c9cb5328a20e7d0f80574a4ad2cbf 100644
--- a/contrib/eigen/Eigen/src/Core/SolveTriangular.h
+++ b/contrib/eigen/Eigen/src/Core/SolveTriangular.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SOLVETRIANGULAR_H
 #define EIGEN_SOLVETRIANGULAR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -54,7 +54,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
   typedef blas_traits<Lhs> LhsProductTraits;
   typedef typename LhsProductTraits::ExtractType ActualLhsType;
   typedef Map<Matrix<RhsScalar,Dynamic,1>, Aligned> MappedRhs;
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   {
     ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
 
@@ -64,7 +64,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
                                                   (useRhsDirectly ? rhs.data() : 0));
-                                                  
+
     if(!useRhsDirectly)
       MappedRhs(actualRhs,rhs.size()) = rhs;
 
@@ -85,7 +85,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
   typedef blas_traits<Lhs> LhsProductTraits;
   typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
 
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   {
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsProductTraits::extract(lhs);
 
@@ -118,7 +118,7 @@ struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
     DiagIndex  = IsLower ? LoopIndex : Size - LoopIndex - 1,
     StartIndex = IsLower ? 0         : DiagIndex+1
   };
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   {
     if (LoopIndex>0)
       rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
@@ -133,22 +133,22 @@ struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
 
 template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
 struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
-  static void run(const Lhs&, Rhs&) {}
+  static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {}
 };
 
 template<typename Lhs, typename Rhs, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,CompleteUnrolling,1> {
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   { triangular_solver_unroller<Lhs,Rhs,Mode,0,Rhs::SizeAtCompileTime>::run(lhs,rhs); }
 };
 
 template<typename Lhs, typename Rhs, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
-  static void run(const Lhs& lhs, Rhs& rhs)
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs)
   {
     Transpose<const Lhs> trLhs(lhs);
     Transpose<Rhs> trRhs(rhs);
-    
+
     triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
                               ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
                               0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
@@ -164,11 +164,11 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
   OtherDerived& other = _other.const_cast_derived();
   eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
-  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower))));
   // If solving for a 0x0 matrix, nothing to do, simply return.
   if (derived().cols() == 0)
     return;
@@ -213,8 +213,8 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
     : m_triangularMatrix(tri), m_rhs(rhs)
   {}
 
-  inline Index rows() const { return m_rhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); }
+  inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   template<typename Dest> inline void evalTo(Dest& dst) const
   {
diff --git a/contrib/eigen/Eigen/src/Core/SolverBase.h b/contrib/eigen/Eigen/src/Core/SolverBase.h
index 8a4adc22973768795fc1a5ebe82762e0119ef52d..5014610420f3e8cea48f0b3fef0a345f1a920f8d 100644
--- a/contrib/eigen/Eigen/src/Core/SolverBase.h
+++ b/contrib/eigen/Eigen/src/Core/SolverBase.h
@@ -14,8 +14,35 @@ namespace Eigen {
 
 namespace internal {
 
+template<typename Derived>
+struct solve_assertion {
+    template<bool Transpose_, typename Rhs>
+    static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); }
+};
+
+template<typename Derived>
+struct solve_assertion<Transpose<Derived> >
+{
+    typedef Transpose<Derived> type;
+
+    template<bool Transpose_, typename Rhs>
+    static void run(const type& transpose, const Rhs& b)
+    {
+        internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b);
+    }
+};
 
+template<typename Scalar, typename Derived>
+struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > >
+{
+    typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type;
 
+    template<bool Transpose_, typename Rhs>
+    static void run(const type& adjoint, const Rhs& b)
+    {
+        internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b);
+    }
+};
 } // end namespace internal
 
 /** \class SolverBase
@@ -35,7 +62,7 @@ namespace internal {
   *
   * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
   *
-  * \sa class PartialPivLU, class FullPivLU
+  * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
   */
 template<typename Derived>
 class SolverBase : public EigenBase<Derived>
@@ -46,6 +73,9 @@ class SolverBase : public EigenBase<Derived>
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef Scalar CoeffReturnType;
 
+    template<typename Derived_>
+    friend struct internal::solve_assertion;
+
     enum {
       RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
@@ -56,7 +86,8 @@ class SolverBase : public EigenBase<Derived>
       MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
                                                              internal::traits<Derived>::MaxColsAtCompileTime>::ret),
       IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
-                           || internal::traits<Derived>::MaxColsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
     };
 
     /** Default constructor */
@@ -74,7 +105,7 @@ class SolverBase : public EigenBase<Derived>
     inline const Solve<Derived, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
-      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b);
       return Solve<Derived, Rhs>(derived(), b.derived());
     }
 
@@ -112,6 +143,13 @@ class SolverBase : public EigenBase<Derived>
     }
 
   protected:
+
+    template<bool Transpose_, typename Rhs>
+    void _check_solve_assertion(const Rhs& b) const {
+        EIGEN_ONLY_USED_FOR_DEBUG(b);
+        eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
+        eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
+    }
 };
 
 namespace internal {
diff --git a/contrib/eigen/Eigen/src/Core/StableNorm.h b/contrib/eigen/Eigen/src/Core/StableNorm.h
index 88c8d9890241cb454b0ca38aa818f9a06ea5b15d..4a3f0cca8c4e0d325e4584c41abf90ecc0133fb6 100644
--- a/contrib/eigen/Eigen/src/Core/StableNorm.h
+++ b/contrib/eigen/Eigen/src/Core/StableNorm.h
@@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
     ssq += (bl*invScale).squaredNorm();
 }
 
+template<typename VectorType, typename RealScalar>
+void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
+{
+  typedef typename VectorType::Scalar Scalar;
+  const Index blockSize = 4096;
+  
+  typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
+  typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
+  const VectorTypeCopy copy(vec);
+  
+  enum {
+    CanAlign = (   (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
+                || (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
+  };
+  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
+                                                   typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
+  Index n = vec.size();
+  
+  Index bi = internal::first_default_aligned(copy);
+  if (bi>0)
+    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
+  for (; bi<n; bi+=blockSize)
+    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
+}
+
+template<typename VectorType>
+typename VectorType::RealScalar
+stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
+{
+  using std::sqrt;
+  using std::abs;
+
+  Index n = vec.size();
+
+  if(n==1)
+    return abs(vec.coeff(0));
+
+  typedef typename VectorType::RealScalar RealScalar;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0); // sum of squares
+
+  stable_norm_impl_inner_step(vec, ssq, scale, invScale);
+  
+  return scale * sqrt(ssq);
+}
+
+template<typename MatrixType>
+typename MatrixType::RealScalar
+stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
+{
+  using std::sqrt;
+
+  typedef typename MatrixType::RealScalar RealScalar;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0); // sum of squares
+
+  for(Index j=0; j<mat.outerSize(); ++j)
+    stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
+  return scale * sqrt(ssq);
+}
+
 template<typename Derived>
 inline typename NumTraits<typename traits<Derived>::Scalar>::Real
 blueNorm_impl(const EigenBase<Derived>& _vec)
@@ -58,52 +123,43 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
   using std::pow;
   using std::sqrt;
   using std::abs;
+
+  // This program calculates the machine-dependent constants
+  // bl, b2, slm, s2m, relerr overfl
+  // from the "basic" machine-dependent numbers
+  // nbig, ibeta, it, iemin, iemax, rbig.
+  // The following define the basic machine-dependent constants.
+  // For portability, the PORT subprograms "ilmaeh" and "rlmach"
+  // are used. For any specific computer, each of the assignment
+  // statements can be replaced
+  static const int ibeta = std::numeric_limits<RealScalar>::radix;  // base for floating-point numbers
+  static const int it    = NumTraits<RealScalar>::digits();  // number of base-beta digits in mantissa
+  static const int iemin = NumTraits<RealScalar>::min_exponent();  // minimum exponent
+  static const int iemax = NumTraits<RealScalar>::max_exponent();  // maximum exponent
+  static const RealScalar rbig   = NumTraits<RealScalar>::highest();  // largest floating-point number
+  static const RealScalar b1     = RealScalar(pow(RealScalar(ibeta),RealScalar(-((1-iemin)/2))));  // lower boundary of midrange
+  static const RealScalar b2     = RealScalar(pow(RealScalar(ibeta),RealScalar((iemax + 1 - it)/2)));  // upper boundary of midrange
+  static const RealScalar s1m    = RealScalar(pow(RealScalar(ibeta),RealScalar((2-iemin)/2)));  // scaling factor for lower range
+  static const RealScalar s2m    = RealScalar(pow(RealScalar(ibeta),RealScalar(- ((iemax+it)/2))));  // scaling factor for upper range
+  static const RealScalar eps    = RealScalar(pow(double(ibeta), 1-it));
+  static const RealScalar relerr = sqrt(eps);  // tolerance for neglecting asml
+
   const Derived& vec(_vec.derived());
-  static bool initialized = false;
-  static RealScalar b1, b2, s1m, s2m, rbig, relerr;
-  if(!initialized)
-  {
-    int ibeta, it, iemin, iemax, iexp;
-    RealScalar eps;
-    // This program calculates the machine-dependent constants
-    // bl, b2, slm, s2m, relerr overfl
-    // from the "basic" machine-dependent numbers
-    // nbig, ibeta, it, iemin, iemax, rbig.
-    // The following define the basic machine-dependent constants.
-    // For portability, the PORT subprograms "ilmaeh" and "rlmach"
-    // are used. For any specific computer, each of the assignment
-    // statements can be replaced
-    ibeta = std::numeric_limits<RealScalar>::radix;                 // base for floating-point numbers
-    it    = std::numeric_limits<RealScalar>::digits;                // number of base-beta digits in mantissa
-    iemin = std::numeric_limits<RealScalar>::min_exponent;          // minimum exponent
-    iemax = std::numeric_limits<RealScalar>::max_exponent;          // maximum exponent
-    rbig  = (std::numeric_limits<RealScalar>::max)();               // largest floating-point number
-
-    iexp  = -((1-iemin)/2);
-    b1    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // lower boundary of midrange
-    iexp  = (iemax + 1 - it)/2;
-    b2    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // upper boundary of midrange
-
-    iexp  = (2-iemin)/2;
-    s1m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for lower range
-    iexp  = - ((iemax+it)/2);
-    s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range
-
-    eps     = RealScalar(pow(double(ibeta), 1-it));
-    relerr  = sqrt(eps);                                            // tolerance for neglecting asml
-    initialized = true;
-  }
   Index n = vec.size();
   RealScalar ab2 = b2 / RealScalar(n);
   RealScalar asml = RealScalar(0);
   RealScalar amed = RealScalar(0);
   RealScalar abig = RealScalar(0);
-  for(typename Derived::InnerIterator it(vec, 0); it; ++it)
+
+  for(Index j=0; j<vec.outerSize(); ++j)
   {
-    RealScalar ax = abs(it.value());
-    if(ax > ab2)     abig += numext::abs2(ax*s2m);
-    else if(ax < b1) asml += numext::abs2(ax*s1m);
-    else             amed += numext::abs2(ax);
+    for(typename Derived::InnerIterator iter(vec, j); iter; ++iter)
+    {
+      RealScalar ax = abs(iter.value());
+      if(ax > ab2)     abig += numext::abs2(ax*s2m);
+      else if(ax < b1) asml += numext::abs2(ax*s1m);
+      else             amed += numext::abs2(ax);
+    }
   }
   if(amed!=amed)
     return amed;  // we got a NaN
@@ -156,36 +212,7 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
-  using std::sqrt;
-  using std::abs;
-  const Index blockSize = 4096;
-  RealScalar scale(0);
-  RealScalar invScale(1);
-  RealScalar ssq(0); // sum of square
-  
-  typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
-  typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
-  const DerivedCopy copy(derived());
-  
-  enum {
-    CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
-                || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
-               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
-                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
-  };
-  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
-                                                   typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
-  Index n = size();
-  
-  if(n==1)
-    return abs(this->coeff(0));
-  
-  Index bi = internal::first_default_aligned(copy);
-  if (bi>0)
-    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
-  for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
-  return scale * sqrt(ssq);
+  return internal::stable_norm_impl(derived());
 }
 
 /** \returns the \em l2 norm of \c *this using the Blue's algorithm.
@@ -213,7 +240,10 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::hypotNorm() const
 {
-  return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+  if(size()==1)
+    return numext::abs(coeff(0,0));
+  else
+    return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
 }
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/Stride.h b/contrib/eigen/Eigen/src/Core/Stride.h
index 513742f34b9acd5077e9146f2bd40a023a00848c..6494d51420a9b3eb195218434ace8b9f9c83d8eb 100644
--- a/contrib/eigen/Eigen/src/Core/Stride.h
+++ b/contrib/eigen/Eigen/src/Core/Stride.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_STRIDE_H
 #define EIGEN_STRIDE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class Stride
   * \ingroup Core_Module
@@ -38,6 +38,10 @@ namespace Eigen {
   * \include Map_general_stride.cpp
   * Output: \verbinclude Map_general_stride.out
   *
+  * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime
+  * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
+  * not allowed).
+  *
   * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
   */
 template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
@@ -55,6 +59,8 @@ class Stride
     Stride()
       : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
     {
+      // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic.
+      // FIXME: for Eigen 4 we should also unify this API with fix<>
       eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
     }
 
@@ -63,7 +69,6 @@ class Stride
     Stride(Index outerStride, Index innerStride)
       : m_outer(outerStride), m_inner(innerStride)
     {
-      eigen_assert(innerStride>=0 && outerStride>=0);
     }
 
     /** Copy constructor */
@@ -73,10 +78,10 @@ class Stride
     {}
 
     /** \returns the outer stride */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outer() const { return m_outer.value(); }
     /** \returns the inner stride */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index inner() const { return m_inner.value(); }
 
   protected:
diff --git a/contrib/eigen/Eigen/src/Core/Swap.h b/contrib/eigen/Eigen/src/Core/Swap.h
index d702009185ec790670230d3ce79aa6955ead7438..180a4e5adff9132036aa81e25beb39d7799aff45 100644
--- a/contrib/eigen/Eigen/src/Core/Swap.h
+++ b/contrib/eigen/Eigen/src/Core/Swap.h
@@ -30,12 +30,13 @@ public:
   typedef typename Base::DstXprType DstXprType;
   typedef swap_assign_op<Scalar> Functor;
   
-  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
   
   template<int StoreMode, int LoadMode, typename PacketType>
-  void assignPacket(Index row, Index col)
+  EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
   {
     PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
     const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
@@ -43,7 +44,7 @@ public:
   }
   
   template<int StoreMode, int LoadMode, typename PacketType>
-  void assignPacket(Index index)
+  EIGEN_STRONG_INLINE void assignPacket(Index index)
   {
     PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
     const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
@@ -52,7 +53,7 @@ public:
   
   // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
   template<int StoreMode, int LoadMode, typename PacketType>
-  void assignPacketByOuterInner(Index outer, Index inner)
+  EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
   {
     Index row = Base::rowIndexByOuterInner(outer, inner); 
     Index col = Base::colIndexByOuterInner(outer, inner);
diff --git a/contrib/eigen/Eigen/src/Core/Transpose.h b/contrib/eigen/Eigen/src/Core/Transpose.h
index 960dc4510c75007c4bf3c9297b7e95c211ff93e7..2bc658f40b88bd588d39457dc9b8f19deb0c14cf 100644
--- a/contrib/eigen/Eigen/src/Core/Transpose.h
+++ b/contrib/eigen/Eigen/src/Core/Transpose.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_TRANSPOSE_H
 #define EIGEN_TRANSPOSE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 template<typename MatrixType>
@@ -61,24 +61,27 @@ template<typename MatrixType> class Transpose
     typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
     EIGEN_DEVICE_FUNC
-    explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}
+    explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
 
     /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const typename internal::remove_all<MatrixTypeNested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     typename internal::remove_reference<MatrixTypeNested>::type&
     nestedExpression() { return m_matrix; }
 
     /** \internal */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void resize(Index nrows, Index ncols) {
       m_matrix.resize(ncols,nrows);
     }
@@ -122,8 +125,10 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
     EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
 
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Index outerStride() const { return derived().nestedExpression().outerStride(); }
 
     typedef typename internal::conditional<
                        internal::is_lvalue<MatrixType>::value,
@@ -131,18 +136,20 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
-    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar* data() const { return derived().nestedExpression().data(); }
 
     // FIXME: shall we keep the const version of coeffRef?
-    EIGEN_DEVICE_FUNC
-    inline const Scalar& coeffRef(Index rowId, Index colId) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return derived().nestedExpression().coeffRef(colId, rowId);
     }
 
-    EIGEN_DEVICE_FUNC
-    inline const Scalar& coeffRef(Index index) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Scalar& coeffRef(Index index) const
     {
       return derived().nestedExpression().coeffRef(index);
     }
@@ -170,7 +177,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
   *
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline Transpose<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
   return TransposeReturnType(derived());
@@ -182,7 +190,8 @@ DenseBase<Derived>::transpose()
   *
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline typename DenseBase<Derived>::ConstTransposeReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
   return ConstTransposeReturnType(derived());
@@ -208,7 +217,7 @@ DenseBase<Derived>::transpose() const
   *
   * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::AdjointReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
   return AdjointReturnType(this->transpose());
@@ -230,11 +239,10 @@ struct inplace_transpose_selector;
 template<typename MatrixType>
 struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
   static void run(MatrixType& m) {
-    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
+    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
   }
 };
 
-// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
 template<typename MatrixType>
 struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
   static void run(MatrixType& m) {
@@ -251,16 +259,66 @@ struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x Packet
   }
 };
 
+
+template <typename MatrixType, Index Alignment>
+void BlockedInPlaceTranspose(MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+  const Index PacketSize = internal::packet_traits<Scalar>::size;
+  eigen_assert(m.rows() == m.cols());
+  int row_start = 0;
+  for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) {
+    for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) {
+      PacketBlock<Packet> A;
+      if (row_start == col_start) {
+        for (Index i=0; i<PacketSize; ++i)
+          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
+        internal::ptranspose(A);
+        for (Index i=0; i<PacketSize; ++i)
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]);
+      } else {
+        PacketBlock<Packet> B;
+        for (Index i=0; i<PacketSize; ++i) {
+          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
+          B.packet[i] = m.template packetByOuterInner<Alignment>(col_start + i, row_start);
+        }
+        internal::ptranspose(A);
+        internal::ptranspose(B);
+        for (Index i=0; i<PacketSize; ++i) {
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]);
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]);
+        }
+      }
+    }
+  }
+  for (Index row = row_start; row < m.rows(); ++row) {
+    m.matrix().row(row).head(row).swap(
+        m.matrix().col(row).head(row).transpose());
+  }
+}
+
 template<typename MatrixType,bool MatchPacketSize>
-struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square or dynamic matrix
   static void run(MatrixType& m) {
-    if (m.rows()==m.cols())
-      m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
-    else
+    typedef typename MatrixType::Scalar Scalar;
+    if (m.rows() == m.cols()) {
+      const Index PacketSize = internal::packet_traits<Scalar>::size;
+      if (!NumTraits<Scalar>::IsComplex && m.rows() >= PacketSize) {
+        if ((m.rows() % PacketSize) == 0)
+          BlockedInPlaceTranspose<MatrixType,internal::evaluator<MatrixType>::Alignment>(m);
+        else
+          BlockedInPlaceTranspose<MatrixType,Unaligned>(m);
+      }
+      else {
+        m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
+      }
+    } else {
       m = m.transpose().eval();
+    }
   }
 };
 
+
 } // end namespace internal
 
 /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
@@ -278,12 +336,12 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
   * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
   * If you just need the transpose of a matrix, use transpose().
   *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
+  * \note if the matrix is not square, then \c *this must be a resizable matrix.
   * This excludes (non-square) fixed-size matrices, block-expressions and maps.
   *
   * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
-inline void DenseBase<Derived>::transposeInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
 {
   eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
                && "transposeInPlace() called on a non-square non-resizable matrix");
@@ -314,7 +372,7 @@ inline void DenseBase<Derived>::transposeInPlace()
   *
   * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
-inline void MatrixBase<Derived>::adjointInPlace()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
 {
   derived() = adjoint().eval();
 }
@@ -393,7 +451,8 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
 template<typename Dst, typename Src>
 void check_for_aliasing(const Dst &dst, const Src &src)
 {
-  internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
+  if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
+    internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
 }
 
 } // end namespace internal
diff --git a/contrib/eigen/Eigen/src/Core/Transpositions.h b/contrib/eigen/Eigen/src/Core/Transpositions.h
index 86da5af59368ed7b779b3be2c6412ecfa783ae68..38a7b01cb51dc0a789c9b02732657ad3c4ae0a6e 100644
--- a/contrib/eigen/Eigen/src/Core/Transpositions.h
+++ b/contrib/eigen/Eigen/src/Core/Transpositions.h
@@ -10,20 +10,22 @@
 #ifndef EIGEN_TRANSPOSITIONS_H
 #define EIGEN_TRANSPOSITIONS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 template<typename Derived>
 class TranspositionsBase
 {
     typedef internal::traits<Derived> Traits;
-    
+
   public:
 
     typedef typename Traits::IndicesType IndicesType;
     typedef typename IndicesType::Scalar StorageIndex;
     typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
+    EIGEN_DEVICE_FUNC
     Derived& derived() { return *static_cast<Derived*>(this); }
+    EIGEN_DEVICE_FUNC
     const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
     /** Copies the \a other transpositions into \c *this */
@@ -33,26 +35,19 @@ class TranspositionsBase
       indices() = other.indices();
       return derived();
     }
-    
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const TranspositionsBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
 
     /** \returns the number of transpositions */
+    EIGEN_DEVICE_FUNC
     Index size() const { return indices().size(); }
     /** \returns the number of rows of the equivalent permutation matrix */
+    EIGEN_DEVICE_FUNC
     Index rows() const { return indices().size(); }
     /** \returns the number of columns of the equivalent permutation matrix */
+    EIGEN_DEVICE_FUNC
     Index cols() const { return indices().size(); }
 
     /** Direct access to the underlying index vector */
+    EIGEN_DEVICE_FUNC
     inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
     /** Direct access to the underlying index vector */
     inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
@@ -66,8 +61,10 @@ class TranspositionsBase
     inline StorageIndex& operator[](Index i) { return indices()(i); }
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return derived().indices(); }
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return derived().indices(); }
 
     /** Resizes to given size. */
@@ -84,7 +81,7 @@ class TranspositionsBase
     }
 
     // FIXME: do we want such methods ?
-    // might be usefull when the target matrix expression is complex, e.g.:
+    // might be useful when the target matrix expression is complex, e.g.:
     // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
     /*
     template<typename MatrixType>
@@ -171,12 +168,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
     inline Transpositions(const TranspositionsBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
-    #endif
-
     /** Generic constructor from expression of the transposition indices. */
     template<typename Other>
     explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
@@ -189,25 +180,16 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
       return Base::operator=(other);
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Transpositions& operator=(const Transpositions& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** Constructs an uninitialized permutation matrix of given size.
       */
     inline Transpositions(Index size) : m_indices(size)
     {}
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
@@ -265,9 +247,11 @@ class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,P
     #endif
 
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
-    
+
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
@@ -306,21 +290,12 @@ class TranspositionsWrapper
       return Base::operator=(other);
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** const version of indices(). */
+    EIGEN_DEVICE_FUNC
     const IndicesType& indices() const { return m_indices; }
 
     /** \returns a reference to the stored array representing the transpositions. */
+    EIGEN_DEVICE_FUNC
     IndicesType& indices() { return m_indices; }
 
   protected:
@@ -374,9 +349,12 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
 
     explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
 
-    Index size() const { return m_transpositions.size(); }
-    Index rows() const { return m_transpositions.size(); }
-    Index cols() const { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); }
 
     /** \returns the \a matrix with the inverse transpositions applied to the columns.
       */
@@ -395,7 +373,8 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
     {
       return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
     }
-    
+
+    EIGEN_DEVICE_FUNC
     const TranspositionType& nestedExpression() const { return m_transpositions; }
 
   protected:
diff --git a/contrib/eigen/Eigen/src/Core/TriangularMatrix.h b/contrib/eigen/Eigen/src/Core/TriangularMatrix.h
index 9abb7e31a8e560eced1941a8b846d81a7c1fa2b7..fdb8bc15a5b2c9ea0c133b26856a0cd091e3127a 100644
--- a/contrib/eigen/Eigen/src/Core/TriangularMatrix.h
+++ b/contrib/eigen/Eigen/src/Core/TriangularMatrix.h
@@ -11,12 +11,12 @@
 #ifndef EIGEN_TRIANGULARMATRIX_H
 #define EIGEN_TRIANGULARMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-  
+
 template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
-  
+
 }
 
 /** \class TriangularBase
@@ -34,16 +34,16 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
       MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
-      
+
       SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
                                                    internal::traits<Derived>::ColsAtCompileTime>::ret),
       /**< This is equal to the number of coefficients, i.e. the number of
           * rows times the number of columns, or to \a Dynamic if this is not
           * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
-      
+
       MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
                                                    internal::traits<Derived>::MaxColsAtCompileTime>::ret)
-        
+
     };
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
@@ -53,18 +53,19 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     typedef Derived const& Nested;
 
     EIGEN_DEVICE_FUNC
-    inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
+    inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); }
+
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); }
 
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return derived().rows(); }
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return derived().cols(); }
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const { return derived().outerStride(); }
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const { return derived().innerStride(); }
-    
     // dummy resize function
+    EIGEN_DEVICE_FUNC
     void resize(Index rows, Index cols)
     {
       EIGEN_UNUSED_VARIABLE(rows);
@@ -155,7 +156,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
   * \param MatrixType the type of the object in which we are taking the triangular part
   * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
   *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
-  *             This is in fact a bit field; it must have either #Upper or #Lower, 
+  *             This is in fact a bit field; it must have either #Upper or #Lower,
   *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
   *
   * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
@@ -197,7 +198,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
 
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
-    
+    typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
+
   public:
 
     typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
@@ -216,15 +218,15 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     EIGEN_DEVICE_FUNC
     explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
     {}
-    
+
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
 
     /** \copydoc EigenBase::rows() */
-    EIGEN_DEVICE_FUNC
-    inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
     /** \copydoc EigenBase::cols() */
-    EIGEN_DEVICE_FUNC
-    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
     /** \returns a const reference to the nested expression */
     EIGEN_DEVICE_FUNC
@@ -233,13 +235,25 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     /** \returns a reference to the nested expression */
     EIGEN_DEVICE_FUNC
     NestedExpression& nestedExpression() { return m_matrix; }
-    
+
     typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
     /** \sa MatrixBase::conjugate() const */
     EIGEN_DEVICE_FUNC
     inline const ConjugateReturnType conjugate() const
     { return ConjugateReturnType(m_matrix.conjugate()); }
 
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
     typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
     /** \sa MatrixBase::adjoint() const */
     EIGEN_DEVICE_FUNC
@@ -255,7 +269,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
       typename MatrixType::TransposeReturnType tmp(m_matrix);
       return TransposeReturnType(tmp);
     }
-    
+
     typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
     /** \sa MatrixBase::transpose() const */
     EIGEN_DEVICE_FUNC
@@ -266,10 +280,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
 
     template<typename Other>
     EIGEN_DEVICE_FUNC
-    inline const Solve<TriangularView, Other> 
+    inline const Solve<TriangularView, Other>
     solve(const MatrixBase<Other>& other) const
     { return Solve<TriangularView, Other>(*this, other.derived()); }
-    
+
   // workaround MSVC ICE
   #if EIGEN_COMP_MSVC
     template<int Side, typename Other>
@@ -313,7 +327,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
       else
         return m_matrix.diagonal().prod();
     }
-      
+
   protected:
 
     MatrixTypeNested m_matrix;
@@ -375,7 +389,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar,typename Other::Scalar>());
       return derived();
     }
-    
+
     /** \sa MatrixBase::operator*=() */
     EIGEN_DEVICE_FUNC
     TriangularViewType&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
@@ -433,14 +447,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
     TriangularViewType& operator=(const TriangularViewImpl& other)
     { return *this = other.derived().nestedExpression(); }
 
-    /** \deprecated */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
     void lazyAssign(const TriangularBase<OtherDerived>& other);
 
-    /** \deprecated */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
     void lazyAssign(const MatrixBase<OtherDerived>& other);
 #endif
 
@@ -468,7 +482,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
       * \a Side==OnTheRight.
       *
-      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
       *
       * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
       * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
@@ -486,7 +500,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       * \sa TriangularView::solveInPlace()
       */
     template<int Side, typename Other>
-    EIGEN_DEVICE_FUNC
     inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
     solve(const MatrixBase<Other>& other) const;
 
@@ -495,7 +508,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
       * This function will const_cast it, so constness isn't honored here.
       *
-      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
       *
       * See TriangularView:solve() for the details.
       */
@@ -521,10 +534,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
-    /** \deprecated
-      * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
+    /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    /** \deprecated */
+    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
     void swap(MatrixBase<OtherDerived> const & other)
     {
       EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
@@ -556,7 +569,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
   internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -566,7 +579,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
   internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
@@ -575,7 +588,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
@@ -585,7 +598,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
   internal::call_assignment_no_alias(derived(), other.derived());
@@ -600,7 +613,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
   * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
 {
   evalToLazy(other.derived());
 }
@@ -626,6 +639,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
   */
 template<typename Derived>
 template<unsigned int Mode>
+EIGEN_DEVICE_FUNC
 typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
@@ -635,6 +649,7 @@ MatrixBase<Derived>::triangularView()
 /** This is the const version of MatrixBase::triangularView() */
 template<typename Derived>
 template<unsigned int Mode>
+EIGEN_DEVICE_FUNC
 typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
@@ -700,7 +715,7 @@ bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
 
 namespace internal {
 
-  
+
 // TODO currently a triangular expression has the form TriangularView<.,.>
 //      in the future triangular-ness should be defined by the expression traits
 //      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
@@ -717,6 +732,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
 {
   typedef TriangularView<MatrixType,Mode> XprType;
   typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  EIGEN_DEVICE_FUNC
   unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
 };
 
@@ -728,7 +744,7 @@ struct Dense2Triangular         {};
 
 template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;
 
- 
+
 /** \internal Specialization of the dense assignment kernel for triangular matrices.
   * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
   * \tparam UpLo must be either Lower or Upper
@@ -745,17 +761,17 @@ protected:
   using Base::m_src;
   using Base::m_functor;
 public:
-  
+
   typedef typename Base::DstEvaluatorType DstEvaluatorType;
   typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
   typedef typename Base::Scalar Scalar;
   typedef typename Base::AssignmentTraits AssignmentTraits;
-  
-  
+
+
   EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
-  
+
 #ifdef EIGEN_INTERNAL_DEBUGGING
   EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
@@ -765,16 +781,16 @@ public:
 #else
   using Base::assignCoeff;
 #endif
-  
+
   EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
          if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
     else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
     else if(Mode==0)                       Base::assignCoeff(id,id);
   }
-  
+
   EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
-  { 
+  {
     eigen_internal_assert(row!=col);
     if(SetOpposite)
       m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
@@ -795,17 +811,17 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con
   if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
     dst.resize(dstRows, dstCols);
   DstEvaluatorType dstEvaluator(dst);
-    
+
   typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
                                               DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
   Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-  
+
   enum {
       unroll = DstXprType::SizeAtCompileTime != Dynamic
             && SrcEvaluatorType::CoeffReadCost < HugeCost
-            && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT
+            && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT
     };
-  
+
   triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
 }
 
@@ -827,8 +843,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular>
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
     eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
-    
-    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
   }
 };
 
@@ -837,7 +853,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense>
 {
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
-    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);  
+    call_triangular_assignment_loop<SrcXprType::Mode, (int(SrcXprType::Mode) & int(SelfAdjoint)) == 0>(dst, src, func);
   }
 };
 
@@ -846,7 +862,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular>
 {
   EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
-    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
   }
 };
 
@@ -857,19 +873,19 @@ struct triangular_assignment_loop
   // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
   typedef typename DstEvaluatorType::XprType DstXprType;
-  
+
   enum {
     col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
     row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
   };
-  
+
   typedef typename Kernel::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
   static inline void run(Kernel &kernel)
   {
     triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
-    
+
     if(row==col)
       kernel.assignDiagonalCoeff(row);
     else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
@@ -912,10 +928,10 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
       }
       else
         i = maxi;
-      
+
       if(i<kernel.rows()) // then i==j
         kernel.assignDiagonalCoeff(i++);
-      
+
       if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
       {
         for(; i < kernel.rows(); ++i)
@@ -932,14 +948,14 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
   * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 {
   other.derived().resize(this->rows(), this->cols());
-  internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
+  internal::call_triangular_assignment_loop<Derived::Mode, (int(Derived::Mode) & int(SelfAdjoint)) == 0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
 }
 
 namespace internal {
-  
+
 // Triangular = Product
 template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
 struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, Dense2Triangular>
@@ -952,7 +968,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_
     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
       dst.resize(dstRows, dstCols);
 
-    dst._assignProduct(src, 1, 0);
+    dst._assignProduct(src, Scalar(1), false);
   }
 };
 
@@ -963,7 +979,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_ass
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,typename SrcXprType::Scalar> &)
   {
-    dst._assignProduct(src, 1, 1);
+    dst._assignProduct(src, Scalar(1), true);
   }
 };
 
@@ -974,7 +990,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_ass
   typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,typename SrcXprType::Scalar> &)
   {
-    dst._assignProduct(src, -1, 1);
+    dst._assignProduct(src, Scalar(-1), true);
   }
 };
 
diff --git a/contrib/eigen/Eigen/src/Core/VectorBlock.h b/contrib/eigen/Eigen/src/Core/VectorBlock.h
index d72fbf7e99de5919939b7720ad892af77923e7e9..71c5b95eec1114baa295c53122099e2a150dd69f 100644
--- a/contrib/eigen/Eigen/src/Core/VectorBlock.h
+++ b/contrib/eigen/Eigen/src/Core/VectorBlock.h
@@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
   * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
   * most of the time this is the only way it is used.
   *
-  * However, if you want to directly maniputate sub-vector expressions,
+  * However, if you want to directly manipulate sub-vector expressions,
   * for instance if you want to write a function returning such an expression, you
   * will need to use this class.
   *
@@ -71,8 +71,8 @@ template<typename VectorType, int Size> class VectorBlock
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline VectorBlock(VectorType& vector, Index start, Index size)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    VectorBlock(VectorType& vector, Index start, Index size)
       : Base(vector,
              IsColVector ? start : 0, IsColVector ? 0 : start,
              IsColVector ? size  : 1, IsColVector ? 1 : size)
@@ -82,8 +82,8 @@ template<typename VectorType, int Size> class VectorBlock
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC
-    inline VectorBlock(VectorType& vector, Index start)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    VectorBlock(VectorType& vector, Index start)
       : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
diff --git a/contrib/eigen/Eigen/src/Core/VectorwiseOp.h b/contrib/eigen/Eigen/src/Core/VectorwiseOp.h
index 4fe267e9f11f6e9f810cf7dc7ab01afa4ecdce0a..870f4f1e40bfc870a03cc1feb8a5ac9f2b7cb486 100644
--- a/contrib/eigen/Eigen/src/Core/VectorwiseOp.h
+++ b/contrib/eigen/Eigen/src/Core/VectorwiseOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -65,10 +65,10 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
     explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
       : m_matrix(mat), m_functor(func) {}
 
-    EIGEN_DEVICE_FUNC
-    Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
-    EIGEN_DEVICE_FUNC
-    Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
 
     EIGEN_DEVICE_FUNC
     typename MatrixType::Nested nestedExpression() const { return m_matrix; }
@@ -81,39 +81,46 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
     const MemberOp m_functor;
 };
 
-#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST)                               \
-  template <typename ResultType>                                        \
-  struct member_##MEMBER {                                              \
-    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                            \
-    typedef ResultType result_type;                                     \
-    template<typename Scalar, int Size> struct Cost                     \
-    { enum { value = COST }; };                                         \
-    template<typename XprType>                                          \
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                               \
-    ResultType operator()(const XprType& mat) const                     \
-    { return mat.MEMBER(); } \
+template<typename A,typename B> struct partial_redux_dummy_func;
+
+#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP)                \
+  template <typename ResultType,typename Scalar>                                                            \
+  struct member_##MEMBER {                                                                  \
+    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                                                \
+    typedef ResultType result_type;                                                         \
+    typedef BINARYOP<Scalar,Scalar> BinaryOp;   \
+    template<int Size> struct Cost { enum { value = COST }; };             \
+    enum { Vectorizable = VECTORIZABLE };                                                   \
+    template<typename XprType>                                                              \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                   \
+    ResultType operator()(const XprType& mat) const                                         \
+    { return mat.MEMBER(); }                                                                \
+    BinaryOp binaryFunc() const { return BinaryOp(); }                                      \
   }
 
+#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
+  EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func)
+
 namespace internal {
 
-EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
-EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost);
-EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
 
-template <int p, typename ResultType>
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_sum_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_min_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_max_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost, 1, internal::scalar_product_op);
+
+template <int p, typename ResultType,typename Scalar>
 struct member_lpnorm {
   typedef ResultType result_type;
-  template<typename Scalar, int Size> struct Cost
+  enum { Vectorizable = 0 };
+  template<int Size> struct Cost
   { enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
   EIGEN_DEVICE_FUNC member_lpnorm() {}
   template<typename XprType>
@@ -121,17 +128,20 @@ struct member_lpnorm {
   { return mat.template lpNorm<p>(); }
 };
 
-template <typename BinaryOp, typename Scalar>
+template <typename BinaryOpT, typename Scalar>
 struct member_redux {
+  typedef BinaryOpT BinaryOp;
   typedef typename result_of<
                      BinaryOp(const Scalar&,const Scalar&)
                    >::type  result_type;
-  template<typename _Scalar, int Size> struct Cost
-  { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
+
+  enum { Vectorizable = functor_traits<BinaryOp>::PacketAccess };
+  template<int Size> struct Cost { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
   EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
   template<typename Derived>
   EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
   { return mat.redux(m_functor); }
+  const BinaryOp& binaryFunc() const { return m_functor; }
   const BinaryOp m_functor;
 };
 }
@@ -139,18 +149,38 @@ struct member_redux {
 /** \class VectorwiseOp
   * \ingroup Core_Module
   *
-  * \brief Pseudo expression providing partial reduction operations
+  * \brief Pseudo expression providing broadcasting and partial reduction operations
   *
   * \tparam ExpressionType the type of the object on which to do partial reductions
-  * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
+  * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal)
   *
-  * This class represents a pseudo expression with partial reduction features.
+  * This class represents a pseudo expression with broadcasting and partial reduction features.
   * It is the return type of DenseBase::colwise() and DenseBase::rowwise()
-  * and most of the time this is the only way it is used.
+  * and most of the time this is the only way it is explicitly used.
+  *
+  * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()`
+  * where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each
+  * column of `A` and then re-assemble the outputs in a matrix expression:
+  * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode
   *
   * Example: \include MatrixBase_colwise.cpp
   * Output: \verbinclude MatrixBase_colwise.out
   *
+  * The begin() and end() methods are obviously exceptions to the previous rule as they
+  * return STL-compatible begin/end iterators to the rows or columns of the nested expression.
+  * Typical use cases include for-range-loop and calls to STL algorithms:
+  *
+  * Example: \include MatrixBase_colwise_iterator_cxx11.cpp
+  * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
+  *
+  * For a partial reduction on an empty input, some rules apply.
+  * For the sake of clarity, let's consider a vertical reduction:
+  *   - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
+  *   - Otherwise, if the number of rows is zero, then
+  *       - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
+  *       - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
+  *       - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op))
+  *
   * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
   */
 template<typename ExpressionType, int Direction> class VectorwiseOp
@@ -163,11 +193,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
     typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
 
-    template<template<typename _Scalar> class Functor,
-                      typename Scalar_=Scalar> struct ReturnType
+    template<template<typename OutScalar,typename InputScalar> class Functor,
+                      typename ReturnScalar=Scalar> struct ReturnType
     {
       typedef PartialReduxExpr<ExpressionType,
-                               Functor<Scalar_>,
+                               Functor<ReturnScalar,Scalar>,
                                Direction
                               > Type;
     };
@@ -187,23 +217,6 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
   protected:
 
-    typedef typename internal::conditional<isVertical,
-                               typename ExpressionType::ColXpr,
-                               typename ExpressionType::RowXpr>::type SubVector;
-    /** \internal
-      * \returns the i-th subvector according to the \c Direction */
-    EIGEN_DEVICE_FUNC
-    SubVector subVector(Index i)
-    {
-      return SubVector(m_matrix.derived(),i);
-    }
-
-    /** \internal
-      * \returns the number of subvectors in the direction \c Direction */
-    EIGEN_DEVICE_FUNC
-    Index subVectors() const
-    { return isVertical?m_matrix.cols():m_matrix.rows(); }
-
     template<typename OtherDerived> struct ExtendedType {
       typedef Replicate<OtherDerived,
                         isVertical   ? 1 : ExpressionType::RowsAtCompileTime,
@@ -258,42 +271,101 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     EIGEN_DEVICE_FUNC
     inline const ExpressionType& _expression() const { return m_matrix; }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
+    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+      * iterator type over the columns or rows as returned by the begin() and end() methods.
+      */
+    random_access_iterator_type iterator;
+    /** This is the const version of iterator (aka read-only) */
+    random_access_iterator_type const_iterator;
+    #else
+    typedef internal::subvector_stl_iterator<ExpressionType,               DirectionType(Direction)> iterator;
+    typedef internal::subvector_stl_iterator<const ExpressionType,         DirectionType(Direction)> const_iterator;
+    typedef internal::subvector_stl_reverse_iterator<ExpressionType,       DirectionType(Direction)> reverse_iterator;
+    typedef internal::subvector_stl_reverse_iterator<const ExpressionType, DirectionType(Direction)> const_reverse_iterator;
+    #endif
+
+    /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression.
+      * \sa end(), cbegin()
+      */
+    iterator                 begin()       { return iterator      (m_matrix, 0); }
+    /** const version of begin() */
+    const_iterator           begin() const { return const_iterator(m_matrix, 0); }
+    /** const version of begin() */
+    const_iterator          cbegin() const { return const_iterator(m_matrix, 0); }
+
+    /** returns a reverse iterator to the last row (rowwise) or column (colwise) of the nested expression.
+      * \sa rend(), crbegin()
+      */
+    reverse_iterator        rbegin()       { return reverse_iterator       (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+	/** const version of rbegin() */
+    const_reverse_iterator  rbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+	/** const version of rbegin() */
+	const_reverse_iterator crbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()-1); }
+
+    /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression
+      * \sa begin(), cend()
+      */
+    iterator                 end()         { return iterator      (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+    /** const version of end() */
+    const_iterator           end()  const  { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+    /** const version of end() */
+    const_iterator          cend()  const  { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+
+    /** returns a reverse iterator to the row (resp. column) before the first row (resp. column) of the nested expression
+      * \sa begin(), cend()
+      */
+    reverse_iterator        rend()         { return reverse_iterator       (m_matrix, -1); }
+    /** const version of rend() */
+    const_reverse_iterator  rend()  const  { return const_reverse_iterator (m_matrix, -1); }
+    /** const version of rend() */
+    const_reverse_iterator crend()  const  { return const_reverse_iterator (m_matrix, -1); }
+
     /** \returns a row or column vector expression of \c *this reduxed by \a func
       *
       * The template parameter \a BinaryOp is the type of the functor
       * of the custom redux operator. Note that func must be an associative operator.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
       * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
       */
     template<typename BinaryOp>
     EIGEN_DEVICE_FUNC
     const typename ReduxReturnType<BinaryOp>::Type
     redux(const BinaryOp& func = BinaryOp()) const
-    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func));
+    }
 
     typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
     typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
-    typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
-    typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
+    typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeNestedCleaned>,internal::member_sum<RealScalar,RealScalar>,Direction> SquaredNormReturnType;
+    typedef CwiseUnaryOp<internal::scalar_sqrt_op<RealScalar>, const SquaredNormReturnType> NormReturnType;
     typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
     typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
     typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
     typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
-    typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
+    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType;
     typedef typename ReturnType<internal::member_all>::Type AllReturnType;
     typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
-    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
+    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index,Scalar>, Direction> CountReturnType;
     typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
     typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
     typedef Reverse<ExpressionType, Direction> ReverseReturnType;
 
     template<int p> struct LpNormReturnType {
-      typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar>,Direction> Type;
+      typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar,Scalar>,Direction> Type;
     };
 
     /** \returns a row (or column) vector expression of the smallest coefficient
       * of each column (or row) of the referenced expression.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_minCoeff.cpp
@@ -302,11 +374,17 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::minCoeff() */
     EIGEN_DEVICE_FUNC
     const MinCoeffReturnType minCoeff() const
-    { return MinCoeffReturnType(_expression()); }
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return MinCoeffReturnType(_expression());
+    }
 
     /** \returns a row (or column) vector expression of the largest coefficient
       * of each column (or row) of the referenced expression.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_maxCoeff.cpp
@@ -315,7 +393,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::maxCoeff() */
     EIGEN_DEVICE_FUNC
     const MaxCoeffReturnType maxCoeff() const
-    { return MaxCoeffReturnType(_expression()); }
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return MaxCoeffReturnType(_expression());
+    }
 
     /** \returns a row (or column) vector expression of the squared norm
       * of each column (or row) of the referenced expression.
@@ -327,7 +408,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::squaredNorm() */
     EIGEN_DEVICE_FUNC
     const SquaredNormReturnType squaredNorm() const
-    { return SquaredNormReturnType(_expression()); }
+    { return SquaredNormReturnType(m_matrix.cwiseAbs2()); }
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression.
@@ -339,7 +420,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::norm() */
     EIGEN_DEVICE_FUNC
     const NormReturnType norm() const
-    { return NormReturnType(_expression()); }
+    { return NormReturnType(squaredNorm()); }
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression.
@@ -404,7 +485,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     * \sa DenseBase::mean() */
     EIGEN_DEVICE_FUNC
     const MeanReturnType mean() const
-    { return MeanReturnType(_expression()); }
+    { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); }
 
     /** \returns a row (or column) vector expression representing
       * whether \b all coefficients of each respective column (or row) are \c true.
@@ -500,7 +581,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
-      return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
+      return m_matrix = extendedTo(other.derived());
     }
 
     /** Adds the vector \a other to each subvector of \c *this */
@@ -510,7 +591,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
+      return m_matrix += extendedTo(other.derived());
     }
 
     /** Substracts the vector \a other to each subvector of \c *this */
@@ -520,7 +601,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
+      return m_matrix -= extendedTo(other.derived());
     }
 
     /** Multiples each subvector of \c *this by the vector \a other */
@@ -532,7 +613,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       m_matrix *= extendedTo(other.derived());
-      return const_cast<ExpressionType&>(m_matrix);
+      return m_matrix;
     }
 
     /** Divides each subvector of \c *this by the vector \a other */
@@ -544,7 +625,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       m_matrix /= extendedTo(other.derived());
-      return const_cast<ExpressionType&>(m_matrix);
+      return m_matrix;
     }
 
     /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
@@ -609,7 +690,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
-                  const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
+                  const typename OppositeExtendedType<NormReturnType>::Type>
     normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
 
 
@@ -658,7 +739,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     EIGEN_DEVICE_FUNC
     const HNormalizedReturnType hnormalized() const;
 
+#   ifdef EIGEN_VECTORWISEOP_PLUGIN
+#     include EIGEN_VECTORWISEOP_PLUGIN
+#   endif
+
   protected:
+    Index redux_length() const
+    {
+      return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
+    }
     ExpressionTypeNested m_matrix;
 };
 
@@ -670,7 +759,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
   * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::ColwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
   return ColwiseReturnType(derived());
@@ -684,7 +773,7 @@ DenseBase<Derived>::colwise()
   * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::RowwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
   return RowwiseReturnType(derived());
diff --git a/contrib/eigen/Eigen/src/Core/Visitor.h b/contrib/eigen/Eigen/src/Core/Visitor.h
index 54c1883d98f85cff9d74aa3b217a3fa448e6d713..00bcca87768eba2f48d31759a14a3c42beb3f6b1 100644
--- a/contrib/eigen/Eigen/src/Core/Visitor.h
+++ b/contrib/eigen/Eigen/src/Core/Visitor.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_VISITOR_H
 #define EIGEN_VISITOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -40,6 +40,14 @@ struct visitor_impl<Visitor, Derived, 1>
   }
 };
 
+// This specialization enables visitors on empty matrices at compile-time
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, 0> {
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/)
+  {}
+};
+
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, Dynamic>
 {
@@ -62,22 +70,22 @@ class visitor_evaluator
 public:
   EIGEN_DEVICE_FUNC
   explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
-  
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
-  
+
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
     CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
   };
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); }
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   { return m_evaluator.coeff(row, col); }
-  
+
 protected:
   internal::evaluator<XprType> m_evaluator;
   const XprType &m_xpr;
@@ -99,6 +107,8 @@ protected:
   * \note compared to one or two \em for \em loops, visitors offer automatic
   * unrolling for small fixed size matrix.
   *
+  * \note if the matrix is empty, then the visitor is left unchanged.
+  *
   * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
   */
 template<typename Derived>
@@ -106,12 +116,15 @@ template<typename Visitor>
 EIGEN_DEVICE_FUNC
 void DenseBase<Derived>::visit(Visitor& visitor) const
 {
+  if(size()==0)
+    return;
+
   typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
-  
+
   enum {
     unroll =  SizeAtCompileTime != Dynamic
-           && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost <= EIGEN_UNROLLING_LIMIT
+           && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits<Visitor>::Cost) <= EIGEN_UNROLLING_LIMIT
   };
   return internal::visitor_impl<Visitor, ThisEvaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(thisEval, visitor);
 }
@@ -124,6 +137,9 @@ namespace internal {
 template <typename Derived>
 struct coeff_visitor
 {
+  // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
+  EIGEN_DEVICE_FUNC
+  coeff_visitor() : row(-1), col(-1), res(0) {}
   typedef typename Derived::Scalar Scalar;
   Index row, col;
   Scalar res;
@@ -141,7 +157,7 @@ struct coeff_visitor
   *
   * \sa DenseBase::minCoeff(Index*, Index*)
   */
-template <typename Derived>
+template <typename Derived, int NaNPropagation>
 struct min_coeff_visitor : coeff_visitor<Derived>
 {
   typedef typename Derived::Scalar Scalar;
@@ -157,8 +173,40 @@ struct min_coeff_visitor : coeff_visitor<Derived>
   }
 };
 
-template<typename Scalar>
-struct functor_traits<min_coeff_visitor<Scalar> > {
+template <typename Derived>
+struct min_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res))
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct min_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(value) || value < this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar, int NaNPropagation>
+    struct functor_traits<min_coeff_visitor<Scalar, NaNPropagation> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost
   };
@@ -169,10 +217,10 @@ struct functor_traits<min_coeff_visitor<Scalar> > {
   *
   * \sa DenseBase::maxCoeff(Index*, Index*)
   */
-template <typename Derived>
+template <typename Derived, int NaNPropagation>
 struct max_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Scalar Scalar; 
+  typedef typename Derived::Scalar Scalar;
   EIGEN_DEVICE_FUNC
   void operator() (const Scalar& value, Index i, Index j)
   {
@@ -185,8 +233,40 @@ struct max_coeff_visitor : coeff_visitor<Derived>
   }
 };
 
-template<typename Scalar>
-struct functor_traits<max_coeff_visitor<Scalar> > {
+template <typename Derived>
+struct max_coeff_visitor<Derived, PropagateNumbers> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res))
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template <typename Derived>
+struct max_coeff_visitor<Derived, PropagateNaN> : coeff_visitor<Derived>
+{
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  void operator() (const Scalar& value, Index i, Index j)
+  {
+    if((numext::isnan)(value) || value > this->res)
+    {
+      this->res = value;
+      this->row = i;
+      this->col = j;
+    }
+  }
+};
+
+template<typename Scalar, int NaNPropagation>
+struct functor_traits<max_coeff_visitor<Scalar, NaNPropagation> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost
   };
@@ -196,17 +276,24 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
 
 /** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
   * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
-  * \warning the result is undefined if \c *this contains NaN.
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   *
   * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
-  internal::min_coeff_visitor<Derived> minVisitor;
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
   this->visit(minVisitor);
   *rowId = minVisitor.row;
   if (colId) *colId = minVisitor.col;
@@ -214,18 +301,25 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 }
 
 /** \returns the minimum of all coefficients of *this and puts in *index its location.
-  * \warning the result is undefined if \c *this contains NaN. 
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   *
   * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
 {
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::min_coeff_visitor<Derived> minVisitor;
+      internal::min_coeff_visitor<Derived, NaNPropagation> minVisitor;
   this->visit(minVisitor);
   *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
   return minVisitor.res;
@@ -233,17 +327,24 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
 
 /** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
   * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
-  * \warning the result is undefined if \c *this contains NaN. 
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   *
   * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
-  internal::max_coeff_visitor<Derived> maxVisitor;
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
   this->visit(maxVisitor);
   *rowPtr = maxVisitor.row;
   if (colPtr) *colPtr = maxVisitor.col;
@@ -251,18 +352,25 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 }
 
 /** \returns the maximum of all coefficients of *this and puts in *index its location.
-  * \warning the result is undefined if \c *this contains NaN.
+  *
+  * In case \c *this contains NaN, NaNPropagation determines the behavior:
+  *   NaNPropagation == PropagateFast : undefined
+  *   NaNPropagation == PropagateNaN : result is NaN
+  *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   *
   * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
   */
 template<typename Derived>
-template<typename IndexType>
+template<int NaNPropagation, typename IndexType>
 EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
 {
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::max_coeff_visitor<Derived> maxVisitor;
+      internal::max_coeff_visitor<Derived, NaNPropagation> maxVisitor;
   this->visit(maxVisitor);
   *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row;
   return maxVisitor.res;
diff --git a/contrib/eigen/Eigen/src/Core/arch/AVX/Complex.h b/contrib/eigen/Eigen/src/Core/arch/AVX/Complex.h
index 7fa61969dc1c4b3b9d95db2f253509450a4bdb1d..ab7bd6c651f05f99e5173937d7624bcc731ca5ed 100644
--- a/contrib/eigen/Eigen/src/Core/arch/AVX/Complex.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AVX/Complex.h
@@ -22,6 +22,7 @@ struct Packet4cf
   __m256  v;
 };
 
+#ifndef EIGEN_VECTORIZE_AVX512
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet4cf type;
@@ -37,6 +38,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -44,8 +46,20 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif
 
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet4cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet8f as_real;
+  enum {
+    size=4,
+    alignment=Aligned32,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -67,10 +81,17 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con
   return Packet4cf(result);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
+  __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
+  return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
 template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
@@ -140,70 +161,12 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packe
                      Packet2cf(_mm256_extractf128_ps(a.v,1))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
-{
-  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  t0 = _mm256_hadd_ps(t0,t1);
-  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
-  t2 = _mm256_hadd_ps(t2,t3);
-  
-  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
-  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
-
-  return Packet4cf(_mm256_add_ps(t1,t3));
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
 {
   return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
                          Packet2cf(_mm256_extractf128_ps(a.v, 1))));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet4cf>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
-  {
-    if (Offset==0) return;
-    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
 
 template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
@@ -228,6 +191,7 @@ struct Packet2cd
   __m256d  v;
 };
 
+#ifndef EIGEN_VECTORIZE_AVX512
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet2cd type;
@@ -243,6 +207,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -250,8 +215,20 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif
 
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet4d as_real;
+  enum {
+    size=2,
+    alignment=Aligned32,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
@@ -272,10 +249,17 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con
   return Packet2cd(_mm256_addsub_pd(even, odd));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
+  __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
+  return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
 { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
@@ -327,63 +311,12 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Pack
                      Packet1cd(_mm256_extractf128_pd(a.v,1))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
-{
-  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
-  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
-
-  return Packet2cd(_mm256_add_pd(t0,t1));
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
 {
   return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
                      Packet1cd(_mm256_extractf128_pd(a.v,1))));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
-  {
-    if (Offset==0) return;
-    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
 
 template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
@@ -424,24 +357,12 @@ ptranspose(PacketBlock<Packet2cd,2>& kernel) {
  kernel.packet[0].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
-{
-  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
-{
-  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2));
+template<> EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
+  return psqrt_complex<Packet2cd>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
-{
-  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
-{
-  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2)));
+template<> EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
+  return psqrt_complex<Packet4cf>(a);
 }
 
 } // end namespace internal
diff --git a/contrib/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h b/contrib/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h
index 6af67ce2d65062bc58f92228bbcf7108ec2ad97c..67041c812c3afe841ed0cac3dbd03825d6a880b2 100644
--- a/contrib/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_MATH_FUNCTIONS_AVX_H
 #define EIGEN_MATH_FUNCTIONS_AVX_H
 
-/* The sin, cos, exp, and log functions of this file are loosely derived from
+/* The sin and cos functions of this file are loosely derived from
  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
  */
 
@@ -18,187 +18,50 @@ namespace Eigen {
 
 namespace internal {
 
-inline Packet8i pshiftleft(Packet8i v, int n)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_slli_epi32(v, n);
-#else
-  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
-  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-#endif
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+psin<Packet8f>(const Packet8f& _x) {
+  return psin_float(_x);
 }
 
-inline Packet8f pshiftright(Packet8f v, int n)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
-#else
-  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
-  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
-  return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
-#endif
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+pcos<Packet8f>(const Packet8f& _x) {
+  return pcos_float(_x);
 }
 
-// Sine function
-// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
-// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
-// are (anti-)symmetric and thus have only odd/even coefficients
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-psin<Packet8f>(const Packet8f& _x) {
-  Packet8f x = _x;
+plog<Packet8f>(const Packet8f& _x) {
+  return plog_float(_x);
+}
 
-  // Some useful values.
-  _EIGEN_DECLARE_CONST_Packet8i(one, 1);
-  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
-  _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
-  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
-  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
-  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
-  _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
-
-  // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
-  Packet8f z = pmul(x, p8f_one_over_pi);
-  Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
-  x = pmadd(shift, p8f_neg_pi_first, x);
-  x = pmadd(shift, p8f_neg_pi_second, x);
-  x = pmadd(shift, p8f_neg_pi_third, x);
-  z = pmul(x, p8f_four_over_pi);
-
-  // Make a mask for the entries that need flipping, i.e. wherever the shift
-  // is odd.
-  Packet8i shift_ints = _mm256_cvtps_epi32(shift);
-  Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
-  Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
-
-  // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
-  // is set to ones for that entry.
-  Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
-
-  // Evaluate the polynomial for the interval [1,3] in z.
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
-  Packet8f z_minus_two = psub(z, p8f_two);
-  Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
-  Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
-  right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
-  right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
-
-  // Evaluate the polynomial for the interval [-1,1] in z.
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
-  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
-  Packet8f z2 = pmul(z, z);
-  Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
-  left = pmadd(left, z2, p8f_coeff_left_3);
-  left = pmadd(left, z2, p8f_coeff_left_1);
-  left = pmul(left, z);
-
-  // Assemble the results, i.e. select the left and right polynomials.
-  left = _mm256_andnot_ps(ival_mask, left);
-  right = _mm256_and_ps(ival_mask, right);
-  Packet8f res = _mm256_or_ps(left, right);
-
-  // Flip the sign on the odd intervals and return the result.
-  res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
-  return res;
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+plog<Packet4d>(const Packet4d& _x) {
+  return plog_double(_x);
 }
 
-// Natural logarithm
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
-// TODO(gonnet): Further reduce the interval allowing for lower-degree
-//               polynomial interpolants -> ... -> profit!
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-plog<Packet8f>(const Packet8f& _x) {
-  Packet8f x = _x;
-  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
-
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  // The smallest non denormalized float number.
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
-
-  // Polynomial coefficients.
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
-
-  Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
-  Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
-
-  // Truncate input values to the minimum positive normal.
-  x = pmax(x, p8f_min_norm_pos);
-
-  Packet8f emm0 = pshiftright(x,23);
-  Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
-
-  // Set the exponents to -1, i.e. x are in the range [0.5,1).
-  x = _mm256_and_ps(x, p8f_inv_mant_mask);
-  x = _mm256_or_ps(x, p8f_half);
-
-  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
-  Packet8f tmp = _mm256_and_ps(x, mask);
-  x = psub(x, p8f_1);
-  e = psub(e, _mm256_and_ps(p8f_1, mask));
-  x = padd(x, tmp);
-
-  Packet8f x2 = pmul(x, x);
-  Packet8f x3 = pmul(x2, x);
-
-  // Evaluate the polynomial approximant of degree 8 in three parts, probably
-  // to improve instruction-level parallelism.
-  Packet8f y, y1, y2;
-  y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
-  y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
-  y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
-  y = pmadd(y, x, p8f_cephes_log_p2);
-  y1 = pmadd(y1, x, p8f_cephes_log_p5);
-  y2 = pmadd(y2, x, p8f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  // Add the logarithm of the exponent back to the result of the interpolation.
-  y1 = pmul(e, p8f_cephes_log_q1);
-  tmp = pmul(x2, p8f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p8f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-
-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
-  return _mm256_or_ps(
-      _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
-      _mm256_and_ps(iszero_mask, p8f_minus_inf));
+plog2<Packet8f>(const Packet8f& _x) {
+  return plog2_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+plog2<Packet4d>(const Packet4d& _x) {
+  return plog2_double(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f plog1p<Packet8f>(const Packet8f& _x) {
+  return generic_plog1p(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f pexpm1<Packet8f>(const Packet8f& _x) {
+  return generic_expm1(_x);
 }
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
@@ -207,149 +70,21 @@ plog<Packet8f>(const Packet8f& _x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 pexp<Packet8f>(const Packet8f& _x) {
-  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
-
-  _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
-
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
-
-  // Clamp x.
-  Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
-
-  // Express exp(x) as exp(m*ln(2) + r), start by extracting
-  // m = floor(x/ln(2) + 0.5).
-  Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
-
-// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
-// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
-// truncation errors. Note that we don't use the "pmadd" function here to
-// ensure that a precision-preserving FMA instruction is used.
-#ifdef EIGEN_VECTORIZE_FMA
-  _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
-  Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
-#else
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
-  Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
-  r = psub(r, pmul(m, p8f_cephes_exp_C2));
-#endif
-
-  Packet8f r2 = pmul(r, r);
-
-  // TODO(gonnet): Split into odd/even polynomials and try to exploit
-  //               instruction-level parallelism.
-  Packet8f y = p8f_cephes_exp_p0;
-  y = pmadd(y, r, p8f_cephes_exp_p1);
-  y = pmadd(y, r, p8f_cephes_exp_p2);
-  y = pmadd(y, r, p8f_cephes_exp_p3);
-  y = pmadd(y, r, p8f_cephes_exp_p4);
-  y = pmadd(y, r, p8f_cephes_exp_p5);
-  y = pmadd(y, r2, r);
-  y = padd(y, p8f_1);
-
-  // Build emm0 = 2^m.
-  Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
-  emm0 = pshiftleft(emm0, 23);
-
-  // Return 2^m * exp(r).
-  return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
+  return pexp_float(_x);
 }
 
 // Hyperbolic Tangent function.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-ptanh<Packet8f>(const Packet8f& x) {
-  return internal::generic_fast_tanh_float(x);
+ptanh<Packet8f>(const Packet8f& _x) {
+  return internal::generic_fast_tanh_float(_x);
 }
 
+// Exponential function for doubles.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
 pexp<Packet4d>(const Packet4d& _x) {
-  Packet4d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
-  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
-  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
-  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
-  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
-
-  Packet4d tmp, fx;
-
-  // clamp x
-  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
-  // Express exp(x) as exp(g + n*log(2)).
-  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
-
-  // Get the integer modulus of log(2), i.e. the "n" described above.
-  fx = _mm256_floor_pd(fx);
-
-  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
-  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
-  // digits right.
-  tmp = pmul(fx, p4d_cephes_exp_C1);
-  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet4d x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial of the rational interpolant.
-  Packet4d px = p4d_cephes_exp_p0;
-  px = pmadd(px, x2, p4d_cephes_exp_p1);
-  px = pmadd(px, x2, p4d_cephes_exp_p2);
-  px = pmul(px, x);
-
-  // Evaluate the denominator polynomial of the rational interpolant.
-  Packet4d qx = p4d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p4d_cephes_exp_q3);
-
-  // I don't really get this bit, copied from the SSE2 routines, so...
-  // TODO(gonnet): Figure out what is going on here, perhaps find a better
-  // rational interpolant?
-  x = _mm256_div_pd(px, psub(qx, px));
-  x = pmadd(p4d_2, x, p4d_1);
-
-  // Build e=2^n by constructing the exponents in a 128-bit vector and
-  // shifting them to where they belong in double-precision values.
-  __m128i emm0 = _mm256_cvtpd_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_1023);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
-  __m128i lo = _mm_slli_epi64(emm0, 52);
-  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
-  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
-  e = _mm256_insertf128_si256(e, hi, 1);
-
-  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
-  // non-finite values in the input.
-  return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
+  return pexp_double(_x);
 }
 
 // Functions for sqrt.
@@ -362,37 +97,39 @@ pexp<Packet4d>(const Packet4d& _x) {
 // For detail see here: http://www.beyond3d.com/content/articles/8/
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-psqrt<Packet8f>(const Packet8f& _x) {
-  Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
-  Packet8f denormal_mask = _mm256_and_ps(
-      _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
-                    _CMP_LT_OQ),
-      _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  Packet8f minus_half_x = pmul(_x, pset1<Packet8f>(-0.5f));
+  Packet8f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet8f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
 
   // Compute approximate reciprocal sqrt.
   Packet8f x = _mm256_rsqrt_ps(_x);
   // Do a single step of Newton's iteration.
-  x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet8f>(1.5f)));
   // Flush results for denormals to zero.
-  return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
+  return pandnot(pmul(_x,x), denormal_mask);
 }
+
 #else
+
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet8f psqrt<Packet8f>(const Packet8f& x) {
-  return _mm256_sqrt_ps(x);
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  return _mm256_sqrt_ps(_x);
 }
+
 #endif
+
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4d psqrt<Packet4d>(const Packet4d& x) {
-  return _mm256_sqrt_pd(x);
+Packet4d psqrt<Packet4d>(const Packet4d& _x) {
+  return _mm256_sqrt_pd(_x);
 }
-#if EIGEN_FAST_MATH
 
+#if EIGEN_FAST_MATH
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
-  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
   _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
@@ -401,36 +138,88 @@ Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
 
   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
-  Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
-  Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
-
-  // Fill in NaNs and Infs for the negative/zero entries.
-  Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
-  Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
-  Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
-                                        _mm256_and_ps(zero_mask, p8f_inf));
-
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
-
-  // Insert NaNs and Infs in all the right places.
-  return _mm256_or_ps(x, infs_and_nans);
+  Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+  Packet8f inf_mask =  _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ);
+  Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask);
+
+  // Compute an approximate result using the rsqrt intrinsic.
+  Packet8f y_approx = _mm256_rsqrt_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive normal arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
+  // x is zero or a positive denormalized float (equivalent to flushing positive
+  // denormalized inputs to zero).
+  return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
 }
 
 #else
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
   _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
-  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
+  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x));
 }
 #endif
 
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+Packet4d prsqrt<Packet4d>(const Packet4d& _x) {
   _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
-  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
+  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x));
 }
 
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) {
+  Packet8f fexponent;
+  const Packet8h out = float2half(pfrexp<Packet8f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) {
+  return float2half(pldexp<Packet8f>(half2float(a), half2float(exponent)));
+}
+
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) {
+  Packet8f fexponent;
+  const Packet8bf out = F32ToBf16(pfrexp<Packet8f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) {
+  return F32ToBf16(pldexp<Packet8f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
 
 }  // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/contrib/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
index 923a124b2066370ba28dd590920301d0966d114d..7fc32fd7192a0caff2e12079187041bbebb627df 100644
--- a/contrib/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -18,11 +18,11 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
 #endif
 
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
@@ -31,10 +31,14 @@ namespace internal {
 typedef __m256  Packet8f;
 typedef __m256i Packet8i;
 typedef __m256d Packet4d;
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+typedef eigen_packet_wrapper<__m128i, 3> Packet8bf;
 
 template<> struct is_arithmetic<__m256>  { enum { value = true }; };
 template<> struct is_arithmetic<__m256i> { enum { value = true }; };
 template<> struct is_arithmetic<__m256d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
+template<> struct is_arithmetic<Packet8bf> { enum { value = true }; };
 
 #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
   const Packet8f p8f_##NAME = pset1<Packet8f>(X)
@@ -58,21 +62,28 @@ template<> struct packet_traits<float>  : default_packet_traits
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=8,
+    size = 8,
     HasHalfPacket = 1,
 
-    HasDiv  = 1,
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = 0,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasExp = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasTanh  = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
     HasBlend = 1,
     HasRound = 1,
     HasFloor = 1,
-    HasCeil = 1
+    HasCeil = 1,
+    HasRint = 1
   };
 };
 template<> struct packet_traits<double> : default_packet_traits
@@ -85,14 +96,104 @@ template<> struct packet_traits<double> : default_packet_traits
     size=4,
     HasHalfPacket = 1,
 
+    HasCmp  = 1,
     HasDiv  = 1,
+    HasLog  = 1,
     HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasBlend = 1,
     HasRound = 1,
     HasFloor = 1,
-    HasCeil = 1
+    HasCeil = 1,
+    HasRint = 1
+  };
+};
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8h type;
+  // There is no half-size packet for Packet8h.
+  typedef Packet8h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasCmp    = 1,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSin    = EIGEN_FAST_MATH,
+    HasCos    = EIGEN_FAST_MATH,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog    = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasExp    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasTanh   = EIGEN_FAST_MATH,
+    HasErf    = EIGEN_FAST_MATH,
+    HasBlend  = 0,
+    HasRound  = 1,
+    HasFloor  = 1,
+    HasCeil   = 1,
+    HasRint   = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
+  };
+};
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet8bf type;
+  // There is no half-size packet for current Packet8bf.
+  // TODO: support as SSE path.
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
   };
 };
 #endif
@@ -113,14 +214,45 @@ template<> struct packet_traits<int>    : default_packet_traits
 };
 */
 
-template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
-template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
-template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8f> {
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet8i  integer_packet;
+  typedef uint8_t   mask_t;
+  enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true};
+};
+template<> struct unpacket_traits<Packet4d> {
+  typedef double type;
+  typedef Packet2d half;
+  enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; };
+template<> struct unpacket_traits<Packet8bf> { typedef bfloat16 type; typedef Packet8bf half; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; };
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 16x16 to 8x16.
+EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) {
+  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
+                         _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
+}
+
 
 template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
 
+template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); }
+template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
+template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); }
+
 template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
 
@@ -129,9 +261,27 @@ template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { retur
 
 template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_add_epi32(a,b);
+#else
+  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_sub_epi32(a,b);
+#else
+  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
 {
@@ -148,7 +298,15 @@ template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
 
 template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
-
+template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_mullo_epi32(a,b);
+#else
+  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
@@ -157,7 +315,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
   return pset1<Packet8i>(0);
 }
 
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
 #if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
   // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
@@ -184,14 +342,112 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d&
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
+
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cmpeq_epi32(a,b);
+#else
+  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may flip
+  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet8f res;
+  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_ps(b,a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet4d res;
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_pd(b,a);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet8f res;
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_ps(b,a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // See pmin above
+  Packet4d res;
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_pd(b,a);
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet4d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet8f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet4d>);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
 
 template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
@@ -199,17 +455,124 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
 template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
 
+
+template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  return _mm256_cmpeq_epi32(a,a);
+#else
+  const __m256 b = _mm256_castsi256_ps(a);
+  return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  const __m256i b = _mm256_castps_si256(a);
+  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+#else
+  return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqq has lower latency than the more general vcmppd
+  const __m256i b = _mm256_castpd_si256(a);
+  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+#else
+  return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
 template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_and_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_or_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_xor_si256(a,b);
+#else
+  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_andnot_si256(b,a);
+#else
+  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
+{
+  const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
+{
+  const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
+{ return _mm256_blendv_ps(b,a,mask); }
+template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
+{ return _mm256_blendv_pd(b,a,mask); }
+
+template<int N> EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_srai_epi32(a, N);
+#else
+  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_srli_epi32(a, N);
+#else
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_slli_epi32(a, N);
+#else
+  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
@@ -219,6 +582,14 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EI
 template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
 
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
+}
+
 // Loads 4 floats from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3, a3}
 template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 {
@@ -226,7 +597,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 //   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
 //   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
 //   return _mm256_unpacklo_ps(tmp,tmp);
-  
+
   // _mm256_insertf128_ps is very slow on Haswell, thus:
   Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
   // mimic an "inplace" permutation of the lower 128bits using a blend
@@ -256,6 +627,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f&
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
 
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from, uint8_t umask) {
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from);
+}
+
 // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
 // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
 template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
@@ -354,47 +733,66 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
   return _mm256_and_pd(a,mask);
 }
 
-// preduxp should be ok
-// FIXME: why is this ok? why isn't the simply implementation working as expected?
-template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
-{
-    __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
-    __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
-    __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
-    __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
-
-    __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-    __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-    __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-    __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-    __m256 perm1 =  _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-    __m256 perm2 =  _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-    __m256 perm3 =  _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-    __m256 perm4 =  _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
+  return pfrexp_generic(a,exponent);
+}
 
-    __m256 sum1 = _mm256_add_ps(perm1, hsum5);
-    __m256 sum2 = _mm256_add_ps(perm2, hsum6);
-    __m256 sum3 = _mm256_add_ps(perm3, hsum7);
-    __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+// Extract exponent without existence of Packet4l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
+  const Packet4d cst_exp_mask  = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
+#ifdef EIGEN_VECTORIZE_AVX2
+  a_expo = _mm256_srli_epi64(a_expo, 52);
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+#else
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+  lo = _mm_srli_epi64(lo, 52);
+  hi = _mm_srli_epi64(hi, 52);
+#endif
+  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
+  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
+  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
+  exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
+  return exponent;
+}
 
-    __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-    __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
 
-    __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
-    return final;
+template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
+  return pfrexp_generic(a, exponent);
 }
-template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
-{
- Packet4d tmp0, tmp1;
 
-  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+  return pldexp_generic(a, exponent);
+}
 
-  return _mm256_blend_pd(tmp0, tmp1, 0xC);
+template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
+  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet4i bias = pset1<Packet4i>(1023);
+  Packet4i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  Packet4i lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  Packet4d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  out = pmul(out, c); // a * 2^e
+  return out;
 }
 
 template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
@@ -406,7 +804,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
   return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
+template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
 {
   return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
 }
@@ -450,93 +848,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
   return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }
 
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+//   return _mm256_movemask_ps(x)==0xFF;
+// }
 
-template<int Offset>
-struct palign_impl<Offset,Packet8f>
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
 {
-  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm256_blend_ps(first, second, 1);
-      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
-      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
-      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
-    }
-    else if (Offset==2)
-    {
-      first = _mm256_blend_ps(first, second, 3);
-      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
-      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
-      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
-    }
-    else if (Offset==3)
-    {
-      first = _mm256_blend_ps(first, second, 7);
-      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
-      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
-      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
-    }
-    else if (Offset==4)
-    {
-      first = _mm256_blend_ps(first, second, 15);
-      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
-      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
-      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
-    }
-    else if (Offset==5)
-    {
-      first = _mm256_blend_ps(first, second, 31);
-      first = _mm256_permute2f128_ps(first, first, 1);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
-      first = _mm256_permute2f128_ps(tmp, tmp, 1);
-      first = _mm256_blend_ps(tmp, first, 0x88);
-    }
-    else if (Offset==6)
-    {
-      first = _mm256_blend_ps(first, second, 63);
-      first = _mm256_permute2f128_ps(first, first, 1);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
-      first = _mm256_permute2f128_ps(tmp, tmp, 1);
-      first = _mm256_blend_ps(tmp, first, 0xcc);
-    }
-    else if (Offset==7)
-    {
-      first = _mm256_blend_ps(first, second, 127);
-      first = _mm256_permute2f128_ps(first, first, 1);
-      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
-      first = _mm256_permute2f128_ps(tmp, tmp, 1);
-      first = _mm256_blend_ps(tmp, first, 0xee);
-    }
-  }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm256_blend_pd(first, second, 1);
-      __m256d tmp = _mm256_permute_pd(first, 5);
-      first = _mm256_permute2f128_pd(tmp, tmp, 1);
-      first = _mm256_blend_pd(tmp, first, 0xA);
-    }
-    else if (Offset==2)
-    {
-      first = _mm256_blend_pd(first, second, 3);
-      first = _mm256_permute2f128_pd(first, first, 1);
-    }
-    else if (Offset==3)
-    {
-      first = _mm256_blend_pd(first, second, 7);
-      __m256d tmp = _mm256_permute_pd(first, 5);
-      first = _mm256_permute2f128_pd(tmp, tmp, 1);
-      first = _mm256_blend_pd(tmp, first, 5);
-    }
-  }
-};
+  return _mm256_movemask_ps(x)!=0;
+}
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet8f,8>& kernel) {
@@ -610,24 +931,640 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons
   return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
+// Packet math for Eigen::half
+
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
+
+template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
+ploaddup<Packet8h>(const Eigen::half*  from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
+  return _mm_set_epi16(d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
+ploadquad<Packet8h>(const Eigen::half* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  return _mm_set_epi16(b, b, b, b, a, a, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+ return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_andnot_si128(sign_mask, a);
+}
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtph_ps(a);
+#else
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+
+  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+#else
+  EIGEN_ALIGN32 float aux[8];
+  pstore(aux, a);
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[0]));
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[1]));
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[2]));
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[3]));
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[4]));
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[5]));
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[6]));
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(Eigen::half(aux[7]));
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
+  return float2half(pmin<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a,
+                                            const Packet8h& b) {
+  return float2half(pmax<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+  return float2half(plset<Packet8f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+  // in some cases Packet4i is a wrapper around __m128i, so we either need to
+  // cast to Packet4i to directly call the intrinsics as below:
+  return _mm_or_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
+  return _mm_xor_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
+  return _mm_and_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
+  return _mm_andnot_si128(b,a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+  return _mm_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+  return float2half(pround<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+  return float2half(print<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+  return float2half(pceil<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+  return float2half(pfloor<Packet8f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_le(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) {
+  return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_xor_si128(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = psub(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
 {
-  return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
 {
-  return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_max<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_min<Packet8f>(af);
+  return Eigen::half(reduced);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
+template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
+  Packet8f af = half2float(a);
+  float reduced = predux_mul<Packet8f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
 {
-  return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm_shuffle_epi8(a,m);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,8>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+  __m128i e = kernel.packet[4];
+  __m128i f = kernel.packet[5];
+  __m128i g = kernel.packet[6];
+  __m128i h = kernel.packet[7];
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+  kernel.packet[0] = a0b0c0d0e0f0g0h0;
+  kernel.packet[1] = a1b1c1d1e1f1g1h1;
+  kernel.packet[2] = a2b2c2d2e2f2g2h2;
+  kernel.packet[3] = a3b3c3d3e3f3g3h3;
+  kernel.packet[4] = a4b4c4d4e4f4g4h4;
+  kernel.packet[5] = a5b5c5d5e5f5g5h5;
+  kernel.packet[6] = a6b6c6d6e6f6g6h6;
+  kernel.packet[7] = a7b7c7d7e7f7g7h7;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][2*i+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
+// BFloat16 implementation.
+
+EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i extend = _mm256_cvtepu16_epi32(a);
+  return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16));
+#else
+  __m128i lo = _mm_cvtepu16_epi32(a);
+  __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
+  __m128i lo_shift = _mm_slli_epi32(lo, 16);
+  __m128i hi_shift = _mm_slli_epi32(hi, 16);
+  return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1));
+#endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
+// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
+  Packet8bf r;
+
+  __m256i input = _mm256_castps_si256(a);
+
+#ifdef EIGEN_VECTORIZE_AVX2
+  // uint32_t lsb = (input >> 16);
+  __m256i t = _mm256_srli_epi32(input, 16);
+  // uint32_t lsb = lsb & 1;
+  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  t = _mm256_add_epi32(t, input);
+  // input = input >> 16;
+  t = _mm256_srli_epi32(t, 16);
+  // Check NaN before converting back to bf16
+  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+  __m256i nan = _mm256_set1_epi32(0x7fc0);
+  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
+  // output = numext::bit_cast<uint16_t>(input);
+  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0),
+                         _mm256_extractf128_si256(t, 1));
+#else
+  // uint32_t lsb = (input >> 16);
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
+  // uint32_t lsb = lsb & 1;
+  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
+  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
+  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
+  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
+  // input = input >> 16;
+  lo = _mm_srli_epi32(lo, 16);
+  hi = _mm_srli_epi32(hi, 16);
+  // Check NaN before converting back to bf16
+  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+  __m128i nan = _mm_set1_epi32(0x7fc0);
+  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
+  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
+  // output = numext::bit_cast<uint16_t>(input);
+  return _mm_packus_epi32(lo, hi);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
+  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
+  return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
+  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf
+ploaddup<Packet8bf>(const bfloat16* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
+  return _mm_set_epi16(d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf
+ploadquad<Packet8bf>(const bfloat16* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  return _mm_set_epi16(b, b, b, b, a, a, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
+ return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_andnot_si128(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a,
+                                                const Packet8bf& b) {
+  return F32ToBf16(pmin<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a,
+                                                const Packet8bf& b) {
+  return F32ToBf16(pmax<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_or_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_xor_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_and_si128(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) {
+  return _mm_andnot_si128(b,a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
+  return _mm_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a)
 {
-  return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
+  return F32ToBf16(pround<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(print<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pceil<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pfloor<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) {
+  return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
+  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_xor_si128(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(padd<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(psub<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pmul<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
+{
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
+{
+  EIGEN_ALIGN32 bfloat16 aux[8];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm_shuffle_epi8(a,m);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+  __m128i e = kernel.packet[4];
+  __m128i f = kernel.packet[5];
+  __m128i g = kernel.packet[6];
+  __m128i h = kernel.packet[7];
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+
+  __m128i ab_03 = _mm_unpacklo_epi16(a, b);
+  __m128i cd_03 = _mm_unpacklo_epi16(c, d);
+  __m128i ab_47 = _mm_unpackhi_epi16(a, b);
+  __m128i cd_47 = _mm_unpackhi_epi16(c, d);
+
+  kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03);
+  kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03);
+  kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47);
+  kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
 }
 
 } // end namespace internal
diff --git a/contrib/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h b/contrib/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h
index 83bfdc604bec57599ff151840c0280f8ec7f6702..d507fb67b21d1379b7eacc444bdfbcd8d7058ba5 100644
--- a/contrib/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -35,15 +35,79 @@ struct type_casting_traits<int, float> {
 };
 
 
+#ifndef EIGEN_VECTORIZE_AVX512
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<bfloat16, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<float, bfloat16> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+#endif  // EIGEN_VECTORIZE_AVX512
 
 template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
-  return _mm256_cvtps_epi32(a);
+  return _mm256_cvttps_epi32(a);
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
   return _mm256_cvtepi32_ps(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
+  return _mm256_castps_si256(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
+  return _mm256_castsi256_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+  return Bf16ToF32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
+  return F32ToBf16(a);
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/contrib/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
index b259c1e1f92db1565eefe9d23418ac1103f2ad79..6fd726d29cffeb66926b3ebb1f337065ed8e51f4 100644
--- a/contrib/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -15,13 +15,13 @@ namespace Eigen {
 namespace internal {
 
 // Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
-#if EIGEN_GNUC_AT_LEAST(5, 3)
+#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG  || EIGEN_COMP_MSVC >= 1923
 
 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
   const Packet16f p16f_##NAME = pset1<Packet16f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
-  const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
+  const Packet16f p16f_##NAME =  preinterpret<Packet16f,Packet16i>(pset1<Packet16i>(X))
 
 #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
   const Packet8d p8d_##NAME = pset1<Packet8d>(X)
@@ -29,108 +29,41 @@ namespace internal {
 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
   const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
 
+#define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \
+  const Packet16bf p16bf_##NAME = pset1<Packet16bf>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \
+  const Packet16bf p16bf_##NAME =  preinterpret<Packet16bf,Packet16i>(pset1<Packet16i>(X))
 
-// Natural logarithm
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
-#if defined(EIGEN_VECTORIZE_AVX512DQ)
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 plog<Packet16f>(const Packet16f& _x) {
-  Packet16f x = _x;
-  _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
-  _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
-
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  // The smallest non denormalized float number.
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
-
-  // Polynomial coefficients.
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
-
-  // invalid_mask is set to true when x is NaN
-  __mmask16 invalid_mask =  _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
-  __mmask16 iszero_mask  =  _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
-      
-  // Truncate input values to the minimum positive normal.
-  x = pmax(x, p16f_min_norm_pos);
-
-  // Extract the shifted exponents.
-  Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
-  Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
-
-  // Set the exponents to -1, i.e. x are in the range [0.5,1).
-  x = _mm512_and_ps(x, p16f_inv_mant_mask);
-  x = _mm512_or_ps(x, p16f_half);
-
-  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
-  Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
-  x = psub(x, p16f_1);
-  e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
-  x = padd(x, tmp);
-
-  Packet16f x2 = pmul(x, x);
-  Packet16f x3 = pmul(x2, x);
-
-  // Evaluate the polynomial approximant of degree 8 in three parts, probably
-  // to improve instruction-level parallelism.
-  Packet16f y, y1, y2;
-  y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
-  y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
-  y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
-  y = pmadd(y, x, p16f_cephes_log_p2);
-  y1 = pmadd(y1, x, p16f_cephes_log_p5);
-  y2 = pmadd(y2, x, p16f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  // Add the logarithm of the exponent back to the result of the interpolation.
-  y1 = pmul(e, p16f_cephes_log_q1);
-  tmp = pmul(x2, p16f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p16f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-
-  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
-  // Filter out invalid inputs, i.e.:
-  //  - negative arg will be NAN,
-  //  - 0 will be -INF.
-  //  - +INF will be +INF
-  return _mm512_mask_blend_ps(iszero_mask,
-            _mm512_mask_blend_ps(invalid_mask,
-              _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
-              p16f_nan),
-            p16f_minus_inf);
+  return plog_float(_x);
 }
 
-#endif
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+plog<Packet8d>(const Packet8d& _x) {
+  return plog_double(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+plog2<Packet16f>(const Packet16f& _x) {
+  return plog2_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
+plog2<Packet8d>(const Packet8d& _x) {
+  return plog2_double(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
@@ -166,17 +99,17 @@ pexp<Packet16f>(const Packet16f& _x) {
   _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
   Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
   Packet16f r2 = pmul(r, r);
+  Packet16f r3 = pmul(r2, r);
 
-  // TODO(gonnet): Split into odd/even polynomials and try to exploit
-  //               instruction-level parallelism.
-  Packet16f y = p16f_cephes_exp_p0;
-  y = pmadd(y, r, p16f_cephes_exp_p1);
-  y = pmadd(y, r, p16f_cephes_exp_p2);
-  y = pmadd(y, r, p16f_cephes_exp_p3);
-  y = pmadd(y, r, p16f_cephes_exp_p4);
-  y = pmadd(y, r, p16f_cephes_exp_p5);
-  y = pmadd(y, r2, r);
-  y = padd(y, p16f_1);
+  // Evaluate the polynomial approximant,improved by instruction-level parallelism.
+  Packet16f y, y1, y2;
+  y  = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1);
+  y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4);
+  y2 = padd(r, p16f_1);
+  y  = pmadd(y, r, p16f_cephes_exp_p2);
+  y1 = pmadd(y1, r, p16f_cephes_exp_p5);
+  y  = pmadd(y, r3, y1);
+  y  = pmadd(y, r2, y2);
 
   // Build emm0 = 2^m.
   Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
@@ -186,74 +119,40 @@ pexp<Packet16f>(const Packet16f& _x) {
   return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
 }
 
-/*template <>
+template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 pexp<Packet8d>(const Packet8d& _x) {
-  Packet8d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
-  _EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
-
-  _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
-  _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
-
-  // clamp x
-  x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
-
-  // Express exp(x) as exp(g + n*log(2)).
-  const Packet8d n =
-      _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
-
-  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
-  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
-  // digits right.
-  const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
-  const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
-  x = psub(x, nC1);
-  x = psub(x, nC2);
-
-  const Packet8d x2 = pmul(x, x);
+  return pexp_double(_x);
+}
 
-  // Evaluate the numerator polynomial of the rational interpolant.
-  Packet8d px = p8d_cephes_exp_p0;
-  px = pmadd(px, x2, p8d_cephes_exp_p1);
-  px = pmadd(px, x2, p8d_cephes_exp_p2);
-  px = pmul(px, x);
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
 
-  // Evaluate the denominator polynomial of the rational interpolant.
-  Packet8d qx = p8d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p8d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p8d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p8d_cephes_exp_q3);
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) {
+  Packet16f fexponent;
+  const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
 
-  // I don't really get this bit, copied from the SSE2 routines, so...
-  // TODO(gonnet): Figure out what is going on here, perhaps find a better
-  // rational interpolant?
-  x = _mm512_div_pd(px, psub(qx, px));
-  x = pmadd(p8d_2, x, p8d_1);
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) {
+  return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent)));
+}
 
-  // Build e=2^n.
-  const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
-      _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
+template <>
+EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) {
+  Packet16f fexponent;
+  const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
 
-  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
-  // non-finite values in the input.
-  return pmax(pmul(x, e), _x);
-  }*/
+template <>
+EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) {
+  return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
 
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
@@ -304,83 +203,157 @@ template <>
 EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
   return _mm512_sqrt_ps(x);
 }
+
 template <>
 EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
   return _mm512_sqrt_pd(x);
 }
 #endif
 
-// Functions for rsqrt.
-// Almost identical to the sqrt routine, just leave out the last multiplication
-// and fill in NaN/Inf where needed. Note that this function only exists as an
-// iterative version for doubles since there is no instruction for diretly
-// computing the reciprocal square root in AVX-512.
-#ifdef EIGEN_FAST_MATH
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
+
+// prsqrt for float.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_rsqrt28_ps(x);
+}
+#elif EIGEN_FAST_MATH
+
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 prsqrt<Packet16f>(const Packet16f& _x) {
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
   _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
 
   Packet16f neg_half = pmul(_x, p16f_minus_half);
 
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
-  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
-
-  // Fill in NaNs and Infs for the negative/zero entries.
-  __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
-  Packet16f infs_and_nans = _mm512_mask_blend_ps(
-      neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
-
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+  // Identity infinite, negative and denormal arguments.
+  __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
+  __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
+  __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
+
+  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
+  // for denormals for consistency with AVX and SSE implementations.
+  Packet16f y_approx = _mm512_rsqrt14_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive finite arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
+  return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
+}
+#else
 
-  // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
+  return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
 }
+#endif
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
 
+// prsqrt for double.
+#if EIGEN_FAST_MATH
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 prsqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
   _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
   _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
 
   Packet8d neg_half = pmul(_x, p8d_minus_half);
 
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
-  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
+  // Identity infinite, negative and denormal arguments.
+  __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
+  __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
+  __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
+
+  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
+  // for denormals for consistency with AVX and SSE implementations.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+  Packet8d y_approx = _mm512_rsqrt28_pd(_x);
+#else
+  Packet8d y_approx = _mm512_rsqrt14_pd(_x);
+#endif
+  // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
+  // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
+  // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
+  // of correct digits for each step.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
+#if !defined(EIGEN_VECTORIZE_AVX512ER)
+  y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
+#endif
+  // Select the result of the Newton-Raphson step for positive finite arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
+  return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
+  _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
+  return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
+}
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16f plog1p<Packet16f>(const Packet16f& _x) {
+  return generic_plog1p(_x);
+}
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
 
-  // Fill in NaNs and Infs for the negative/zero entries.
-  __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
-  Packet8d infs_and_nans = _mm512_mask_blend_pd(
-      neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
+  return generic_expm1(_x);
+}
 
-  // Do a first step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
 
-  // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+#endif
+
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+psin<Packet16f>(const Packet16f& _x) {
+  return psin_float(_x);
+}
 
-  // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+pcos<Packet16f>(const Packet16f& _x) {
+  return pcos_float(_x);
 }
-#elif defined(EIGEN_VECTORIZE_AVX512ER)
+
 template <>
-EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
-  return _mm512_rsqrt28_ps(x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+ptanh<Packet16f>(const Packet16f& _x) {
+  return internal::generic_fast_tanh_float(_x);
 }
-#endif
-#endif
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
 
 }  // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/contrib/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
index 000b7762ff2bd0c9bb1c3a5a0b80a57e6084a424..34d49ab660c3670061c670ca6210ecafed39f7f1 100644
--- a/contrib/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -31,6 +31,8 @@ namespace internal {
 typedef __m512 Packet16f;
 typedef __m512i Packet16i;
 typedef __m512d Packet8d;
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
 
 template <>
 struct is_arithmetic<__m512> {
@@ -45,6 +47,51 @@ struct is_arithmetic<__m512d> {
   enum { value = true };
 };
 
+template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+  typedef Packet16h type;
+  // There is no half-size packet for Packet16h.
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasCmp    = 1,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 0,
+    HasLog    = 1,
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasExp    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasSin    = EIGEN_FAST_MATH,
+    HasCos    = EIGEN_FAST_MATH,
+    HasTanh   = EIGEN_FAST_MATH,
+    HasErf    = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound  = 1,
+    HasFloor  = 1,
+    HasCeil   = 1,
+    HasRint   = 1,
+    HasBessel = 1,
+    HasNdtri  = 1
+  };
+};
+
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet16f type;
@@ -54,16 +101,32 @@ template<> struct packet_traits<float>  : default_packet_traits
     AlignedOnScalar = 1,
     size = 16,
     HasHalfPacket = 1,
+
+    HasAbs = 1,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
     HasBlend = 0,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
-#ifdef EIGEN_VECTORIZE_AVX512DQ
     HasLog = 1,
-#endif
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasNdtri = 1,
+    HasBessel  = 1,
     HasExp = 1,
     HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
 #endif
-    HasDiv = 1
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
   };
  };
 template<> struct packet_traits<double> : default_packet_traits
@@ -76,10 +139,17 @@ template<> struct packet_traits<double> : default_packet_traits
     size = 8,
     HasHalfPacket = 1,
 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+    HasLog  = 1,
+    HasExp = 1,
     HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
 #endif
-    HasDiv = 1
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
   };
 };
 
@@ -100,19 +170,27 @@ struct unpacket_traits<Packet16f> {
   typedef float type;
   typedef Packet8f half;
   typedef Packet16i integer_packet;
-  enum { size = 16, alignment=Aligned64 };
+  typedef uint16_t mask_t;
+  enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true };
 };
 template <>
 struct unpacket_traits<Packet8d> {
   typedef double type;
   typedef Packet4d half;
-  enum { size = 8, alignment=Aligned64 };
+  enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false };
 };
 template <>
 struct unpacket_traits<Packet16i> {
   typedef int type;
   typedef Packet8i half;
-  enum { size = 16, alignment=Aligned64 };
+  enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false };
+};
+
+template<>
+struct unpacket_traits<Packet16h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
 };
 
 template <>
@@ -128,6 +206,33 @@ EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
   return _mm512_set1_epi32(from);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
+  return _mm512_castsi512_ps(_mm512_set1_epi32(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(const numext::uint64_t from) {
+  return _mm512_castsi512_pd(_mm512_set1_epi64(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); }
+
+template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
+  return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+                                              0, -1, 0, -1, 0, -1, 0, -1));
+}
+template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
+  return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+                          0, -1, 0, -1, 0, -1, 0, -1);
+}
+template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
+  return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
+                                              0, 0, -1, -1, 0, 0, -1, -1));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
   return _mm512_broadcastss_ps(_mm_load_ps1(from));
@@ -217,7 +322,7 @@ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
 template <>
 EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
                                               const Packet16i& b) {
-  return _mm512_mul_epi32(a, b);
+  return _mm512_mullo_epi32(a, b);
 }
 
 template <>
@@ -244,6 +349,24 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
 }
 #endif
 
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
+                                           const Packet16f& a,
+                                           const Packet16f& b) {
+  __mmask16 mask16 = _mm512_cmp_epi32_mask(
+      _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_ps(mask16, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask,
+                                          const Packet8d& a,
+                                          const Packet8d& b) {
+  __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
+                                         _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_pd(mask8, a, b);
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
@@ -270,6 +393,41 @@ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
   return _mm512_max_pd(b, a);
 }
 
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet8d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet16f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet8d>);
+}
+
+
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
 template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
@@ -310,6 +468,85 @@ EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
   return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
+}
+
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); }
+template<> EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); }
+
+template<> EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); }
+template<> EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); }
+
+template <>
+EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
+  return _mm512_set1_epi32(0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
+  return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
+  return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
                                               const Packet16i& b) {
@@ -411,6 +648,21 @@ EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d&
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a)
+{
+  // Work-around for default std::round rounding mode.
+  const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+template<> EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a)
+{
+  // Work-around for default std::round rounding mode.
+  const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
 template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
   return _mm512_srai_epi32(a, N);
 }
@@ -451,6 +703,12 @@ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
       reinterpret_cast<const __m512i*>(from));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from);
+}
+
 // Loads 8 floats from memory a returns the packet
 // {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
 template <>
@@ -535,6 +793,11 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
       reinterpret_cast<__m512i*>(to), from);
 }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from);
+}
 
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
@@ -631,6 +894,59 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
                                    _mm512_set1_epi64(0x7fffffffffffffff)));
 }
 
+template<>
+EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
+  return pfrexp_generic(a, exponent);
+}
+
+// Extract exponent without existence of Packet8l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
+  const Packet8d cst_exp_mask  = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  #ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
+  #else
+  return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
+  #endif
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet8d max_exponent = pset1<Packet8d>(2099.0);
+  const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet8i bias = pset1<Packet8i>(1023);
+  Packet8i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+  Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
+  Packet8i lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  Packet8d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
+  lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  out = pmul(out, c);  // a * 2^e
+  return out;
+}
+
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                           \
@@ -686,27 +1002,26 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
-  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
-  return padd(lane0, lane1);
+  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
+  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
+  return _mm256_add_ps(lane0, lane1);
 #else
-  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
-  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
-  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
-  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
-  Packet4f sum0 = padd(lane0, lane2);
-  Packet4f sum1 = padd(lane1, lane3);
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 sum0 = _mm_add_ps(lane0, lane2);
+  __m128 sum1 = _mm_add_ps(lane1, lane3);
   return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
-  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
-  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
-  Packet4d res = padd(lane0, lane1);
-  return res;
+EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return _mm256_add_pd(lane0, lane1);
 }
 
 template <>
@@ -777,196 +1092,13 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
   return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs)
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
 {
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
-  EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);
-
-  __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0);
-  __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0);
-  __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0);
-  __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0);
-
-  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
-  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
-  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
-  __m256 sum4 = _mm256_add_ps(perm4, hsum8);
-
-  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
-
-  hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1);
-  hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1);
-  hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1);
-  hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1);
-
-  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-  sum1 = _mm256_add_ps(perm1, hsum5);
-  sum2 = _mm256_add_ps(perm2, hsum6);
-  sum3 = _mm256_add_ps(perm3, hsum7);
-  sum4 = _mm256_add_ps(perm4, hsum8);
-
-  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-  final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0));
-
-  hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0);
-  hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0);
-  hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0);
-  hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0);
-
-  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-  sum1 = _mm256_add_ps(perm1, hsum5);
-  sum2 = _mm256_add_ps(perm2, hsum6);
-  sum3 = _mm256_add_ps(perm3, hsum7);
-  sum4 = _mm256_add_ps(perm4, hsum8);
-
-  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-  __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);
-
-  hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
-  hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
-  hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
-  hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);
-
-  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
-  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
-  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
-  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
-  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
-  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
-  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
-  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
-  sum1 = _mm256_add_ps(perm1, hsum5);
-  sum2 = _mm256_add_ps(perm2, hsum6);
-  sum3 = _mm256_add_ps(perm3, hsum7);
-  sum4 = _mm256_add_ps(perm4, hsum8);
-
-  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
-  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
-  final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));
-
-  __m512 final_output;
-
-  EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);
-  return final_output;
+  Packet16i xi = _mm512_castps_si512(x);
+  __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
+  return !_mm512_kortestz(tmp,tmp);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
-{
-  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
-  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);
-
-  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
-  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);
-
-  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
-  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);
-
-  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
-  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);
-
-  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
-  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);
-
-  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
-  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);
-
-  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
-  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);
-
-  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
-  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);
-
-  Packet4d tmp0, tmp1;
-
-  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
-  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);
-
-  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
-  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));
-
-  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
-  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);
-
-  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
-  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
-  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
-  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
-  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
-
-  __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
-
-  return _mm512_insertf64x4(final_output, final_1, 1);
-}
- 
 
 
 #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
@@ -1242,61 +1374,927 @@ EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
   return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
-  return _mm512_cvttps_epi32(a);
+// Packet math for Eigen::half
+template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  return _mm256_set1_epi16(from.x);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  _mm256_store_si256((__m256i*)(void*)to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  _mm256_storeu_si256((__m256i*)(void*)to, from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploaddup<Packet16h>(const Eigen::half*  from) {
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  unsigned short e = from[4].x;
+  unsigned short f = from[5].x;
+  unsigned short g = from[6].x;
+  unsigned short h = from[7].x;
+  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploadquad(const Eigen::half* from) {
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+}
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm512_cvtph_ps(a);
+#else
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+  float f8(aux[8]);
+  float f9(aux[9]);
+  float fa(aux[10]);
+  float fb(aux[11]);
+  float fc(aux[12]);
+  float fd(aux[13]);
+  float fe(aux[14]);
+  float ff(aux[15]);
+
+  return _mm512_set_ps(
+      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+#else
+  EIGEN_ALIGN64 float aux[16];
+  pstore(aux, a);
+  half h0(aux[0]);
+  half h1(aux[1]);
+  half h2(aux[2]);
+  half h3(aux[3]);
+  half h4(aux[4]);
+  half h5(aux[5]);
+  half h6(aux[6]);
+  half h7(aux[7]);
+  half h8(aux[8]);
+  half h9(aux[9]);
+  half ha(aux[10]);
+  half hb(aux[11]);
+  half hc(aux[12]);
+  half hd(aux[13]);
+  half he(aux[14]);
+  half hf(aux[15]);
+
+  return _mm256_set_epi16(
+      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
+      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+  return ptrue(Packet8i(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) {
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a,
+                                              const Packet16h& b) {
+  return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+  return float2half(plset<Packet16f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+  // in some cases Packet8i is a wrapper around __m256i, so we need to
+  // cast to Packet8i to call the correct overload.
+  return por(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
+  return pxor(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
+  return pand(Packet8i(a),Packet8i(b));
+}
+template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
+  return pandnot(Packet8i(a),Packet8i(b));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
-  return _mm512_cvtepi32_ps(a);
+template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+  return _mm256_blendv_epi8(b, a, mask);
 }
 
-template <int Offset>
-struct palign_impl<Offset, Packet16f> {
-  static EIGEN_STRONG_INLINE void run(Packet16f& first,
-                                      const Packet16f& second) {
-    if (Offset != 0) {
-      __m512i first_idx = _mm512_set_epi32(
-          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
-          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
-          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
+template<> EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+  return float2half(pround<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+  return float2half(print<Packet16f>(half2float(a)));
+}
 
-      __m512i second_idx =
-          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
-                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
-                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
-                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);
+template<> EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+  return float2half(pceil<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+  return float2half(pfloor<Packet16f>(half2float(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  return Pack32To16(pcmp_eq(af, bf));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_le(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) {
+  return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
+}
 
-      unsigned short mask = 0xFFFF;
-      mask <<= (16 - Offset);
+template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; }
 
-      first = _mm512_permutexvar_ps(first_idx, first);
-      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
-      first = _mm512_mask_blend_ps(mask, first, tmp);
+template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+  Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  return _mm256_xor_si256(a, sign_mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = psub(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux(from_float));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+  Packet8h lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8h lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8h>(lane0, lane1);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
+  Packet16f af = half2float(a);
+  float reduced = predux_max<Packet16f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
+  Packet16f af = half2float(a);
+  float reduced = predux_min<Packet16f>(af);
+  return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+  Packet16f from_float = half2float(from);
+  return half(predux_mul(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
+{
+  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  return _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)),
+                                           _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
+{
+  return _mm256_set_epi16(
+      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
+      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
+      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
+      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
+{
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+  to[stride*8] = aux[8];
+  to[stride*9] = aux[9];
+  to[stride*10] = aux[10];
+  to[stride*11] = aux[11];
+  to[stride*12] = aux[12];
+  to[stride*13] = aux[13];
+  to[stride*14] = aux[14];
+  to[stride*15] = aux[15];
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,16>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+  __m256i e = kernel.packet[4];
+  __m256i f = kernel.packet[5];
+  __m256i g = kernel.packet[6];
+  __m256i h = kernel.packet[7];
+  __m256i i = kernel.packet[8];
+  __m256i j = kernel.packet[9];
+  __m256i k = kernel.packet[10];
+  __m256i l = kernel.packet[11];
+  __m256i m = kernel.packet[12];
+  __m256i n = kernel.packet[13];
+  __m256i o = kernel.packet[14];
+  __m256i p = kernel.packet[15];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+  kernel.packet[0] = a_p_0;
+  kernel.packet[1] = a_p_1;
+  kernel.packet[2] = a_p_2;
+  kernel.packet[3] = a_p_3;
+  kernel.packet[4] = a_p_4;
+  kernel.packet[5] = a_p_5;
+  kernel.packet[6] = a_p_6;
+  kernel.packet[7] = a_p_7;
+  kernel.packet[8] = a_p_8;
+  kernel.packet[9] = a_p_9;
+  kernel.packet[10] = a_p_a;
+  kernel.packet[11] = a_p_b;
+  kernel.packet[12] = a_p_c;
+  kernel.packet[13] = a_p_d;
+  kernel.packet[14] = a_p_e;
+  kernel.packet[15] = a_p_f;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,8>& kernel) {
+  EIGEN_ALIGN64 half in[8][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+  pstore<half>(in[4], kernel.packet[4]);
+  pstore<half>(in[5], kernel.packet[5]);
+  pstore<half>(in[6], kernel.packet[6]);
+  pstore<half>(in[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half out[8][16];
+
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      out[i][j] = in[j][2*i];
+    }
+    for (int j = 0; j < 8; ++j) {
+      out[i][j+8] = in[j][2*i+1];
     }
   }
-};
-template <int Offset>
-struct palign_impl<Offset, Packet8d> {
-  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
-    if (Offset != 0) {
-      __m512i first_idx = _mm512_set_epi32(
-          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
-          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
-
-      __m512i second_idx = _mm512_set_epi32(
-          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
-          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
-
-      unsigned char mask = 0xFF;
-      mask <<= (8 - Offset);
-
-      first = _mm512_permutexvar_pd(first_idx, first);
-      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
-      first = _mm512_mask_blend_pd(mask, first, tmp);
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+  kernel.packet[4] = pload<Packet16h>(out[4]);
+  kernel.packet[5] = pload<Packet16h>(out[5]);
+  kernel.packet[6] = pload<Packet16h>(out[6]);
+  kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,4>& kernel) {
+  EIGEN_ALIGN64 half in[4][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half out[4][16];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][4*i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+4] = in[j][4*i+1];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+8] = in[j][4*i+2];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j+12] = in[j][4*i+3];
     }
   }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+template <> struct is_arithmetic<Packet16bf> { enum { value = true }; };
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet16bf type;
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+    HasBlend = 0,
+    HasInsert = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+    HasLog = 1,  // Currently fails test with bad accuracy.
+    HasLog1p  = 1,
+    HasExpm1  = 1,
+    HasNdtri = 1,
+    HasBessel  = 1,
+#endif
+    HasExp = 1,
+    HasSqrt = EIGEN_FAST_MATH,
+    HasRsqrt = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasCmp  = 1,
+    HasDiv = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16bf>
+{
+  typedef bfloat16 type;
+  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet8bf half;
 };
 
+template <>
+EIGEN_STRONG_INLINE Packet16bf pset1<Packet16bf>(const bfloat16& from) {
+  return _mm256_set1_epi16(from.value);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet16bf>(const Packet16bf& from) {
+  bfloat16 t;
+  t.value = static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
+  return t;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pload<Packet16bf>(const bfloat16* from) {
+  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploadu<Packet16bf>(const bfloat16* from) {
+  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to,
+                                          const Packet16bf& from) {
+  _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to,
+                                           const Packet16bf& from) {
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf
+ploaddup<Packet16bf>(const bfloat16* from) {
+  Packet16bf r;
+  unsigned short a = from[0].value;
+  unsigned short b = from[1].value;
+  unsigned short c = from[2].value;
+  unsigned short d = from[3].value;
+  unsigned short e = from[4].value;
+  unsigned short f = from[5].value;
+  unsigned short g = from[6].value;
+  unsigned short h = from[7].value;
+  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf
+ploadquad(const bfloat16* from) {
+  Packet16bf r;
+  unsigned short a = from[0].value;
+  unsigned short b = from[1].value;
+  unsigned short c = from[2].value;
+  unsigned short d = from[3].value;
+  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+}
+
+EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) {
+  return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
+}
+
+// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
+EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
+  Packet16bf r;
+
+#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
+  // Since GCC 10.1 supports avx512bf16 and C style explicit cast
+  // (C++ static_cast is not supported yet), do converion via intrinsic
+  // and register path for performance.
+  r = (__m256i)(_mm512_cvtneps_pbh(a));
+
+#else
+  __m512i t;
+  __m512i input = _mm512_castps_si512(a);
+  __m512i nan = _mm512_set1_epi32(0x7fc0);
+
+  // uint32_t lsb = (input >> 16) & 1;
+  t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  t = _mm512_add_epi32(t, input);
+  // input = input >> 16;
+  t = _mm512_srli_epi32(t, 16);
+
+  // Check NaN before converting back to bf16
+  __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
+
+  t = _mm512_mask_blend_epi32(mask, nan, t);
+  // output.value = static_cast<uint16_t>(input);
+  r = _mm512_cvtepi32_epi16(t);
+#endif // EIGEN_VECTORIZE_AVX512BF16
+
+  return r;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) {
+  return ptrue<Packet8i>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) {
+  return por<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) {
+  return pxor<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) {
+  return pand<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return pandnot<Packet8i>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask,
+                                       const Packet16bf& a,
+                                       const Packet16bf& b) {
+  // Input mask is expected to be all 0/1, handle it with 8-bit
+  // intrinsic for performance.
+  return _mm256_blendv_epi8(b, a, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a)
+{
+  return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a,
+                                       const Packet16bf& b) {
+  return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a,
+                                              const Packet16bf& b) {
+  return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnegate(const Packet16bf& a) {
+  Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  return _mm256_xor_si256(a, sign_mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pconj(const Packet16bf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) {
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a,
+                                                const Packet16bf& b) {
+  return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
+  Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8bf>(lane0, lane1);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet16bf>(const Packet16bf& p) {
+  return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(p)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet16bf>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
+  __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
+                               14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+
+  Packet16bf res;
+  // Swap hi and lo first because shuffle is in 128-bit lanes.
+  res = _mm256_permute2x128_si256(a, a, 1);
+  // Shuffle 8-bit values in src within 2*128-bit lanes.
+  return _mm256_shuffle_epi8(res, m);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from,
+                                                             Index stride) {
+  return _mm256_set_epi16(
+      from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
+      from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
+      from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
+      from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to,
+                                                        const Packet16bf& from,
+                                                        Index stride) {
+  EIGEN_ALIGN64 bfloat16 aux[16];
+  pstore(aux, from);
+  to[stride*0] = aux[0];
+  to[stride*1] = aux[1];
+  to[stride*2] = aux[2];
+  to[stride*3] = aux[3];
+  to[stride*4] = aux[4];
+  to[stride*5] = aux[5];
+  to[stride*6] = aux[6];
+  to[stride*7] = aux[7];
+  to[stride*8] = aux[8];
+  to[stride*9] = aux[9];
+  to[stride*10] = aux[10];
+  to[stride*11] = aux[11];
+  to[stride*12] = aux[12];
+  to[stride*13] = aux[13];
+  to[stride*14] = aux[14];
+  to[stride*15] = aux[15];
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+  __m256i e = kernel.packet[4];
+  __m256i f = kernel.packet[5];
+  __m256i g = kernel.packet[6];
+  __m256i h = kernel.packet[7];
+  __m256i i = kernel.packet[8];
+  __m256i j = kernel.packet[9];
+  __m256i k = kernel.packet[10];
+  __m256i l = kernel.packet[11];
+  __m256i m = kernel.packet[12];
+  __m256i n = kernel.packet[13];
+  __m256i o = kernel.packet[14];
+  __m256i p = kernel.packet[15];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
+  kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
+  kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
+  kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
+}
 
 } // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/AltiVec/Complex.h b/contrib/eigen/Eigen/src/Core/arch/AltiVec/Complex.h
index 3e665730cfa05d34f3eccbe26aafd8ee8829a62f..f424f11cf7d9d652d1cc2e443931171c308600c1 100644
--- a/contrib/eigen/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -29,8 +29,54 @@ static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (P
 //---------- float ----------
 struct Packet2cf
 {
-  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
+  EIGEN_STRONG_INLINE explicit Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
+  {
+    Packet4f v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+    // Get the imaginary parts of a
+    v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+    // multiply a_re * b
+    v1 = vec_madd(v1, b.v, p4f_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(v2, b.v, p4f_ZERO);
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+    // permute back to a proper order
+    v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
+
+    return Packet2cf(padd<Packet4f>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    v = pmul(Packet2cf(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+    return Packet2cf(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+    return Packet2cf(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+    return Packet2cf(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+    return Packet2cf(-v);
+  }
+
   Packet4f  v;
 };
 
@@ -38,6 +84,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
   typedef Packet2cf half;
+  typedef Packet4f as_real;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -60,7 +107,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -80,16 +127,35 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
 
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>* from0, const std::complex<float>* from1)
+{
+  Packet4f res0, res1;
+#ifdef __VSX__
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0));
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1));
+#ifdef _BIG_ENDIAN
+  __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+#else
+  __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+#endif
+#else
+  *reinterpret_cast<std::complex<float> *>(&res0) = *from0;
+  *reinterpret_cast<std::complex<float> *>(&res1) = *from1;
+  res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
+#endif
+  return Packet2cf(res0);
+}
+
 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
   return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
   pstore<std::complex<float> >((std::complex<float> *) af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -100,25 +166,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, con
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet4f v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
-  // Get the imaginary parts of a
-  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
-  // multiply a_re * b 
-  v1 = vec_madd(v1, b.v, p4f_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
-  // permute back to a proper order
-  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
-  
-  return Packet2cf(padd<Packet4f>(v1, v2));
-}
-
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
@@ -128,7 +175,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<float> res[2];
   pstore((float *)&res, a.v);
 
   return res[0];
@@ -149,22 +196,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
   return pfirst<Packet2cf>(Packet2cf(b));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
-  Packet4f b1, b2;
-#ifdef _BIG_ENDIAN  
-  b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
-  b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
-#else
-  b1 = vec_sld(vecs[1].v, vecs[0].v, 8);
-  b2 = vec_sld(vecs[0].v, vecs[1].v, 8);
-#endif
-  b2 = vec_sld(b2, b2, 8);
-  b2 = padd<Packet4f>(b1, b2);
-
-  return Packet2cf(b2);
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   Packet4f b;
@@ -175,61 +206,12 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
   return pfirst<Packet2cf>(prod);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-#ifdef _BIG_ENDIAN
-      first.v = vec_sld(first.v, second.v, 8);
-#else
-      first.v = vec_sld(second.v, first.v, 8);
-#endif
-    }
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for AltiVec
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
+  Packet2cf res = pmul(a, pconj(b));
   Packet4f s = pmul<Packet4f>(b.v, b.v);
   return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
 }
@@ -246,6 +228,11 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
   kernel.packet[0].v = tmp;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v,b.v));
+  return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
+}
+
 #ifdef __VSX__
 template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
   Packet2cf result;
@@ -254,12 +241,62 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con
 }
 #endif
 
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
+{
+  return psqrt_complex<Packet2cf>(a);
+}
+
 //---------- double ----------
 #ifdef __VSX__
 struct Packet1cd
 {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
+  {
+    Packet2d a_re, a_im, v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+    // Get the imaginary parts of a
+    a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+    // multiply a_re * b
+    v1 = vec_madd(a_re, b.v, p2d_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(a_im, b.v, p2d_ZERO);
+    v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
+
+    return Packet1cd(padd<Packet2d>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    v = pmul(Packet1cd(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+    return Packet1cd(*this) *= b;
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+    return Packet1cd(*this) += b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+    return Packet1cd(*this) -= b;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+    return Packet1cd(-v);
+  }
+
   Packet2d v;
 };
 
@@ -267,6 +304,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet1cd type;
   typedef Packet1cd half;
+  typedef Packet2d as_real;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
@@ -286,7 +324,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; };
 
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -296,19 +334,13 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet1cd>(af);
+  return pload<Packet1cd>(from);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
-  pstore<std::complex<double> >(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
+  pstore<std::complex<double> >(to, from);
 }
 
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
@@ -316,24 +348,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, con
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  Packet2d a_re, a_im, v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
-  // Get the imaginary parts of a
-  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
-  // multiply a_re * b
-  v1 = vec_madd(a_re, b.v, p2d_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
-  v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
-
-  return Packet1cd(padd<Packet2d>(v1, v2));
-}
-
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
@@ -345,7 +359,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<double> res[2];
   pstore<std::complex<double> >(res, a);
 
   return res[0];
@@ -354,59 +368,15 @@ template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Pac
 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
 
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)        { return vecs[0]; }
 
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
-  {
-    // FIXME is it sure we never have to align a Packet1cd?
-    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   // TODO optimize it for AltiVec
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet1cd res = pmul(a,pconj(b));
   Packet2d s = pmul<Packet2d>(b.v, b.v);
   return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
 }
@@ -422,6 +392,23 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
   kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
   kernel.packet[0].v = tmp;
 }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a)==re(b), im(a)==im(b)]
+  Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v,b.v));
+  // Swap real/imag elements in the mask in to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
+  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cd(vec_and(eq, eq_swapped));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
+{
+  return psqrt_complex<Packet1cd>(a);
+}
+
 #endif // __VSX__
 } // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/contrib/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index c5e4bede74d7a86f36681160d209f43f5b2eb4e3..3a7a329361eea0cd6ba25b81d663a8409ed70384 100644
--- a/contrib/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -9,10 +9,6 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
 #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
 #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
 
@@ -20,180 +16,28 @@ namespace Eigen {
 
 namespace internal {
 
-static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-/* the smallest non denormalized float number */
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
-  
-/* natural logarithm computed for 4 simultaneous float
-  return NaN for x <= 0
-*/
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-#ifdef __VSX__
-static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
-#ifdef __POWER8_VECTOR__
-static Packet2l p2l_1023 = { 1023, 1023 };
-static Packet2ul p2ul_52 = { 52, 52 };
-#endif
-
-#endif
-
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f plog<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-
-  Packet4i emm0;
-
-  /* isvalid_mask is 0 if x < 0 or x is NaN. */
-  Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
-  Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
-
-  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
-  emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
-                reinterpret_cast<Packet4ui>(p4i_23));
-
-  /* keep only the fractional part */
-  x = pand(x, p4f_inv_mant_mask);
-  x = por(x, p4f_half);
-
-  emm0 = psub(emm0, p4i_0x7f);
-  Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
-
-  /* part2:
-     if( x < SQRTHF ) {
-       e -= 1;
-       x = x + x - 1.0;
-     } else { x = x - 1.0; }
-  */
-  Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
-  Packet4f tmp = pand(x, mask);
-  x = psub(x, p4f_1);
-  e = psub(e, pand(p4f_1, mask));
-  x = padd(x, tmp);
-
-  Packet4f x2 = pmul(x,x);
-  Packet4f x3 = pmul(x2,x);
-
-  Packet4f y, y1, y2;
-  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
-  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
-  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
-  y  = pmadd(y , x, p4f_cephes_log_p2);
-  y1 = pmadd(y1, x, p4f_cephes_log_p5);
-  y2 = pmadd(y2, x, p4f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  y1 = pmul(e, p4f_cephes_log_q1);
-  tmp = pmul(x2, p4f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p4f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-  // negative arg will be NAN, 0 will be -INF
-  x = vec_sel(x, p4f_minus_inf, iszero_mask);
-  x = vec_sel(p4f_minus_nan, x, isvalid_mask);
-  return x;
+  return plog_float(_x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-
-  Packet4f tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
-  // express exp(x) as exp(g + n*log(2))
-  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-  fx = pfloor(fx);
-
-  tmp = pmul(fx, p4f_cephes_exp_C1);
-  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  z = pmul(x,x);
-
-  Packet4f y = p4f_cephes_exp_p0;
-  y = pmadd(y, x, p4f_cephes_exp_p1);
-  y = pmadd(y, x, p4f_cephes_exp_p2);
-  y = pmadd(y, x, p4f_cephes_exp_p3);
-  y = pmadd(y, x, p4f_cephes_exp_p4);
-  y = pmadd(y, x, p4f_cephes_exp_p5);
-  y = pmadd(y, z, x);
-  y = padd(y, p4f_1);
+  return pexp_float(_x);
+}
 
-  // build 2^n
-  emm0 = vec_cts(fx, 0);
-  emm0 = vec_add(emm0, p4i_0x7f);
-  emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& _x)
+{
+  return psin_float(_x);
+}
 
-  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
-  // inputs and return them unmodified.
-  Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
-  return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
-                 isnumber_mask);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& _x)
+{
+  return pcos_float(_x);
 }
 
 #ifndef EIGEN_COMP_CLANG
@@ -225,95 +69,19 @@ Packet2d psqrt<Packet2d>(const Packet2d& x)
   return  vec_sqrt(x);
 }
 
-// VSX support varies between different compilers and even different
-// versions of the same compiler.  For gcc version >= 4.9.3, we can use
-// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
-// a slow version that works with older compilers. 
-// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
-// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
-static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 4) || \
-    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
-  return vec_cts(x, 0);    // TODO: check clang version.
-#else
-  double tmp[2];
-  memcpy(tmp, &x, sizeof(tmp));
-  Packet2l l = { static_cast<long long>(tmp[0]),
-                 static_cast<long long>(tmp[1]) };
-  return l;
-#endif
-}
-
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
-  Packet2d x = _x;
-
-  Packet2d tmp, fx;
-  Packet2l emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
-
-  fx = pfloor(fx);
-
-  tmp = pmul(fx, p2d_cephes_exp_C1);
-  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet2d x2 = pmul(x,x);
-
-  Packet2d px = p2d_cephes_exp_p0;
-  px = pmadd(px, x2, p2d_cephes_exp_p1);
-  px = pmadd(px, x2, p2d_cephes_exp_p2);
-  px = pmul (px, x);
-
-  Packet2d qx = p2d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
-  x = pdiv(px,psub(qx,px));
-  x = pmadd(p2d_2,x,p2d_1);
-
-  // build 2^n
-  emm0 = ConvertToPacket2l(fx);
-
-#ifdef __POWER8_VECTOR__ 
-  emm0 = vec_add(emm0, p2l_1023);
-  emm0 = vec_sl(emm0, p2ul_52);
-#else
-  // Code is a bit complex for POWER7.  There is actually a
-  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
-  // So we shift (52-32) bits and do a word swap with zeros.
-  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
-  _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32
-
-  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
-  emm04i = vec_add(emm04i, p4i_1023);
-  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
-  static const Packet16uc perm = {
-    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
-    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
-#ifdef  _BIG_ENDIAN
-  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
-#else
-  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
-#endif
-
+  return pexp_double(_x);
+}
 #endif
 
-  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
-  // inputs and return them unmodified.
-  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
-  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
-                 isnumber_mask);
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
 }
-#endif
 
 }  // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h b/contrib/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 08a27d153026e46ae9c4af0e76a96cd7c03f4e23..2a440545b20c79fbce327a4083f25274d03d553a 100755
--- a/contrib/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/contrib/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -22,31 +22,38 @@ namespace internal {
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
 #endif
 
-typedef __vector float          Packet4f;
-typedef __vector int            Packet4i;
-typedef __vector unsigned int   Packet4ui;
-typedef __vector __bool int     Packet4bi;
-typedef __vector short int      Packet8i;
-typedef __vector unsigned char  Packet16uc;
+typedef __vector float                   Packet4f;
+typedef __vector int                     Packet4i;
+typedef __vector unsigned int            Packet4ui;
+typedef __vector __bool int              Packet4bi;
+typedef __vector short int               Packet8s;
+typedef __vector unsigned short int      Packet8us;
+typedef __vector signed char             Packet16c;
+typedef __vector unsigned char           Packet16uc;
+typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
 
 // We don't want to write the same code all the time, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
-
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+  Packet4f p4f_##NAME = {X, X, X, X}
 
 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
   Packet4i p4i_##NAME = vec_splat_s32(X)
 
+#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
+  Packet4ui p4ui_##NAME = {X, X, X, X}
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
+  Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
+
+#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
+  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
+
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
@@ -64,7 +71,7 @@ typedef __vector unsigned char  Packet16uc;
 
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
-
+#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type 
 
 // These constants are endian-agnostic
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
@@ -72,25 +79,36 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
+static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
+static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
+static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
 static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
 #ifndef __VSX__
 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 #endif
 
-static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
-static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet4f  p4f_COUNTDOWN  = { 0.0, 1.0, 2.0, 3.0 };
+static Packet4i  p4i_COUNTDOWN  = { 0, 1, 2, 3 };
+static Packet8s  p8s_COUNTDOWN  = { 0, 1, 2, 3, 4, 5, 6, 7 };
+static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
+
+static Packet16c  p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
+                                    8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 
+                                    8, 9, 10, 11, 12, 13, 14, 15};
 
 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
-static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
+static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
 
-// Mask alignment
-#ifdef __PPC64__
-#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
-#else
-#define _EIGEN_MASK_ALIGNMENT	0xfffffff0
-#endif
+static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
+static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
+static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
+static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
 
-#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
 
 // Handle endianness properly while loading constants
 // Define global static constants:
@@ -129,27 +147,27 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_L
   #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
 #endif
 
-template<> struct packet_traits<float>  : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
   typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
+    size = 4,
     HasHalfPacket = 1,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 1,
-    HasMin  = 1,
-    HasMax  = 1,
-    HasAbs  = 1,
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
 #ifdef __VSX__
     HasSqrt = 1,
 #if !EIGEN_COMP_CLANG
@@ -160,16 +178,62 @@ template<> struct packet_traits<float>  : default_packet_traits
 #else
     HasSqrt = 0,
     HasRsqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
 #endif
     HasRound = 1,
     HasFloor = 1,
     HasCeil = 1,
+    HasRint = 1,
     HasNegate = 1,
     HasBlend = 1
   };
 };
-template<> struct packet_traits<int>    : default_packet_traits
-{
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet8bf type;
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+#ifdef __VSX__
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+#endif
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<int> : default_packet_traits {
   typedef Packet4i type;
   typedef Packet4i half;
   enum {
@@ -178,6 +242,79 @@ template<> struct packet_traits<int>    : default_packet_traits
     size = 4,
     HasHalfPacket = 0,
 
+    HasAdd   = 1,
+    HasSub   = 1,
+    HasShift = 1,
+    HasMul   = 1,
+    HasDiv   = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<short int> : default_packet_traits {
+  typedef Packet8s type;
+  typedef Packet8s half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<unsigned short int> : default_packet_traits {
+  typedef Packet8us type;
+  typedef Packet8us half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<signed char> : default_packet_traits {
+  typedef Packet16c type;
+  typedef Packet16c half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 0,
+
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<unsigned char> : default_packet_traits {
+  typedef Packet16uc type;
+  typedef Packet16uc half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 0,
+
     HasAdd  = 1,
     HasSub  = 1,
     HasMul  = 1,
@@ -186,9 +323,62 @@ template<> struct packet_traits<int>    : default_packet_traits
   };
 };
 
+template<> struct unpacket_traits<Packet4f>
+{
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet4i  integer_packet;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet4i>
+{
+  typedef int       type;
+  typedef Packet4i  half;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8s>
+{
+  typedef short int type;
+  typedef Packet8s  half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet8us>
+{
+  typedef unsigned short int type;
+  typedef Packet8us          half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+
+template<> struct unpacket_traits<Packet16c>
+{
+  typedef signed char type;
+  typedef Packet16c  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet16uc>
+{
+  typedef unsigned char type;
+  typedef Packet16uc  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet8bf>
+{
+  typedef bfloat16 type;
+  typedef Packet8bf          half;
+  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
+{
+  union {
+    Packet16c   v;
+    signed char n[16];
+  } vt;
+  vt.v = v;
+  for (int i=0; i< 16; i++)
+    s << vt.n[i] << ", ";
+  return s;
+}
 
 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
 {
@@ -198,7 +388,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
   } vt;
   vt.v = v;
   for (int i=0; i< 16; i++)
-    s << (int)vt.n[i] << ", ";
+    s << vt.n[i] << ", ";
   return s;
 }
 
@@ -235,148 +425,397 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
   return s;
 }
 
-// Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
 {
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(from);
   EIGEN_DEBUG_ALIGNED_LOAD
 #ifdef __VSX__
-  return vec_vsx_ld(0, from);
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
 #else
   return vec_ld(0, from);
 #endif
 }
 
+// Need to define them first or we get specialization after instantiation errors
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+  return pload_common<Packet4f>(from);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef __VSX__
-  return vec_vsx_ld(0, from);
-#else
-  return vec_ld(0, from);
-#endif
+  return pload_common<Packet4i>(from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
+{
+  return pload_common<Packet8s>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
+{
+  return pload_common<Packet8us>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char*     from)
+{
+  return pload_common<Packet16c>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char*     from)
+{
+  return pload_common<Packet16uc>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16*     from)
 {
+  return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
+  // some versions of GCC throw "unused-but-set-parameter" (float *to).
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(to);
   EIGEN_DEBUG_ALIGNED_STORE
 #ifdef __VSX__
-  vec_vsx_st(from, 0, to);
+  vec_xst(from, 0, to);
 #else
   vec_st(from, 0, to);
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  pstore_common<Packet4f>(to, from);
+}
+
 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
+  pstore_common<Packet4i>(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  Packet4f v = {from, from, from, from};
+template<> EIGEN_STRONG_INLINE void pstore<short int>(short int*       to, const Packet8s& from)
+{
+  pstore_common<Packet8s>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int*       to, const Packet8us& from)
+{
+  pstore_common<Packet8us>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16*       to, const Packet8bf& from)
+{
+  pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char*       to, const Packet16c& from)
+{
+  pstore_common<Packet16c>(to, from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char*       to, const Packet16uc& from)
+{
+  pstore_common<Packet16uc>(to, from);
+}
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from};
   return v;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  Packet4i v = {from, from, from, from};
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from, from, from, from, from};
   return v;
 }
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+
+template<typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
+{
+  Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
+  return v;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
+  return pset1_size4<Packet4f>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
+  return pset1_size4<Packet4i>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int&    from)   {
+  return pset1_size8<Packet8s>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int&    from)   {
+  return pset1_size8<Packet8us>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char&    from)   {
+  return pset1_size16<Packet16c>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char&    from)   {
+  return pset1_size16<Packet16uc>(from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16&    from)   {
+  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE void
+pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
+                      Packet& a0, Packet& a1, Packet& a2, Packet& a3)
 {
-  a3 = pload<Packet4f>(a);
+  a3 = pload<Packet>(a);
   a0 = vec_splat(a3, 0);
   a1 = vec_splat(a3, 1);
   a2 = vec_splat(a3, 2);
   a3 = vec_splat(a3, 3);
 }
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
+}
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4i>(const int *a,
                       Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
 {
-  a3 = pload<Packet4i>(a);
-  a0 = vec_splat(a3, 0);
-  a1 = vec_splat(a3, 1);
-  a2 = vec_splat(a3, 2);
-  a3 = vec_splat(a3, 3);
+  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  return pload<Packet>(a);
 }
 
 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  af[2] = from[2*stride];
-  af[3] = from[3*stride];
- return pload<Packet4f>(af);
+  return pgather_common<Packet4f>(from, stride);
 }
+
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
- return pload<Packet4i>(ai);
+  return pgather_common<Packet4i>(from, stride);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
-  pstore<float>(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-  to[2*stride] = af[2];
-  to[3*stride] = af[3];
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  a[4] = from[4*stride];
+  a[5] = from[5*stride];
+  a[6] = from[6*stride];
+  a[7] = from[7*stride];
+  return pload<Packet>(a);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+
+template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
-  pstore<int>((int *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
+  return pgather_size8<Packet8s>(from, stride);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
-
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }
+template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
+{
+  return pgather_size8<Packet8us>(from, stride);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }
+template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
+{
+  return pgather_size8<Packet8bf>(from, stride);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
+  a[0] = from[0*stride];
+  a[1] = from[1*stride];
+  a[2] = from[2*stride];
+  a[3] = from[3*stride];
+  a[4] = from[4*stride];
+  a[5] = from[5*stride];
+  a[6] = from[6*stride];
+  a[7] = from[7*stride];
+  a[8] = from[8*stride];
+  a[9] = from[9*stride];
+  a[10] = from[10*stride];
+  a[11] = from[11*stride];
+  a[12] = from[12*stride];
+  a[13] = from[13*stride];
+  a[14] = from[14*stride];
+  a[15] = from[15*stride];
+  return pload<Packet>(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }
+template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
+{
+  return pgather_size16<Packet16c>(from, stride);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
 {
-#ifndef __VSX__  // VSX actually provides a div instruction
-  Packet4f t, y_0, y_1;
+  return pgather_size16<Packet16uc>(from, stride);
+}
 
-  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
-  y_0 = vec_re(b);
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+}
 
-  // Do one Newton-Raphson iteration to get the needed accuracy
-  t   = vec_nmsub(y_0, b, p4f_ONE);
-  y_1 = vec_madd(y_0, t, y_0);
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  pscatter_size4<Packet4f>(to, from, stride);
+}
 
-  return vec_madd(a, y_1, p4f_MZERO);
-#else
-  return vec_div(a, b);
-#endif
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  pscatter_size4<Packet4i>(to, from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+  to[4*stride] = a[4];
+  to[5*stride] = a[5];
+  to[6*stride] = a[6];
+  to[7*stride] = a[7];
+}
+
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
+{
+  pscatter_size8<Packet8s>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
+{
+  pscatter_size8<Packet8us>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
+{
+  pscatter_size8<Packet8bf>(to, from, stride);
+}
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
+{
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
+  pstore<__UNPACK_TYPE__(Packet)>(a, from);
+  to[0*stride] = a[0];
+  to[1*stride] = a[1];
+  to[2*stride] = a[2];
+  to[3*stride] = a[3];
+  to[4*stride] = a[4];
+  to[5*stride] = a[5];
+  to[6*stride] = a[6];
+  to[7*stride] = a[7];
+  to[8*stride] = a[8];
+  to[9*stride] = a[9];
+  to[10*stride] = a[10];
+  to[11*stride] = a[11];
+  to[12*stride] = a[12];
+  to[13*stride] = a[13];
+  to[14*stride] = a[14];
+  to[15*stride] = a[15];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
+{
+  pscatter_size16<Packet16c>(to, from, stride);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
+{
+  pscatter_size16<Packet16uc>(to, from, stride);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f   plset<Packet4f>(const float&     a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN;  }
+template<> EIGEN_STRONG_INLINE Packet4i   plset<Packet4i>(const int&       a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN;  }
+template<> EIGEN_STRONG_INLINE Packet8s   plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet8us  plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet16c  plset<Packet16c>(const signed char& a)   { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a)   { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
+
+template<> EIGEN_STRONG_INLINE Packet4f   padd<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4i   padd<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4ui   padd<Packet4ui>  (const Packet4ui&   a, const Packet4ui&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet8s   padd<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet8us  padd<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet16c  padd<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
+
+template<> EIGEN_STRONG_INLINE Packet4f   psub<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4i   psub<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet8s   psub<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet8us  psub<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet16c  psub<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
+
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4f   pmul<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return vec_madd(a,b, p4f_MZERO); }
+template<> EIGEN_STRONG_INLINE Packet4i   pmul<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a * b; }
+template<> EIGEN_STRONG_INLINE Packet8s   pmul<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us  pmul<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c  pmul<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#ifndef __VSX__  // VSX actually provides a div instruction
+  Packet4f t, y_0, y_1;
+
+  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
+  y_0 = vec_re(b);
+
+  // Do one Newton-Raphson iteration to get the needed accuracy
+  t   = vec_nmsub(y_0, b, p4f_ONE);
+  y_1 = vec_madd(y_0, t, y_0);
+
+  return vec_madd(a, y_1, p4f_MZERO);
+#else
+  return vec_div(a, b);
+#endif
 }
 
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
@@ -387,10 +826,13 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
+template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   #ifdef __VSX__
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet4f ret;
   __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -399,10 +841,16 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const
   #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
+
 
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   #ifdef __VSX__
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet4f ret;
   __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -411,79 +859,214 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
   #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
+  return vec_nor(c,c);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return pand<Packet8us>(a, b);
+}
+
 
 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return por<Packet8us>(a, b);
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 
+  return pxor<Packet8us>(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+    Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
+    Packet4f res;
+
+#ifdef __VSX__
+    __asm__("xvrspiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+#else
+    __asm__("vrfiz %0, %1\n\t"
+        : "=v" (res)
+        : "v" (t));
+#endif
+
+    return res;
+}
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
+{
+    Packet4f res;
 
-#ifdef _BIG_ENDIAN
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+    __asm__("xvrspic %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (a));
+
+    return res;
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
 {
   EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef _BIG_ENDIAN
   Packet16uc MSQ, LSQ;
   Packet16uc mask;
   MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
   LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
   mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4f) vec_perm(MSQ, LSQ, mask);           // align the data
+  //TODO: Add static_cast here
+  return (Packet) vec_perm(MSQ, LSQ, mask);           // align the data
+#else
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#endif
+}
 
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  return ploadu_common<Packet4f>(from);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
+  return ploadu_common<Packet4i>(from);
 }
-#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
 {
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
+  return ploadu_common<Packet8s>(from);
 }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
 {
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
+  return ploadu_common<Packet8us>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
+{
+  return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
+{
+  return ploadu_common<Packet16c>(from);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
+{
+  return ploadu_common<Packet16uc>(from);
 }
-#endif
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)*   from)
 {
-  Packet4f p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
-  else                                  p = ploadu<Packet4f>(from);
+  Packet p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet>(from);
+  else                                  p = ploadu<Packet>(from);
   return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
+{
+  return ploaddup_common<Packet4f>(from);
+}
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
-  Packet4i p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
-  else                                  p = ploadu<Packet4i>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+  return ploaddup_common<Packet4i>(from);
 }
 
-#ifdef _BIG_ENDIAN
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int*     from)
+{
+  Packet8s p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
+  else                                  p = ploadu<Packet8s>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int*     from)
+{
+  Packet8us p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
+  else                                  p = ploadu<Packet8us>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int*     from)
+{
+  Packet8s p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
+  else                                  p = ploadu<Packet8s>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int*     from)
+{
+  Packet8us p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
+  else                                  p = ploadu<Packet8us>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16*     from)
+{
+  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char*     from)
+{
+  Packet16c p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16c>(from);
+  else                                  p = ploadu<Packet16c>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char*     from)
+{
+  Packet16uc p;
+  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16uc>(from);
+  else                                  p = ploadu<Packet16uc>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from)
 {
   EIGEN_DEBUG_UNALIGNED_STORE
+#ifdef _BIG_ENDIAN
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
   // Warning: not thread safe!
   Packet16uc MSQ, LSQ, edges;
@@ -497,45 +1080,69 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& f
   MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
   LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+  vec_st( MSQ, 0, (unsigned char *)to );                   // Store the MSQ part second
+#else
+  vec_xst(from, 0, to);
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
+{
+  pstoreu_common<Packet4f>(to, from);
 }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
 {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
-
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+  pstoreu_common<Packet4i>(to, from);
 }
-#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)
+template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int*      to, const Packet8s& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
+  pstoreu_common<Packet8s>(to, from);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from)
+template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int*      to, const Packet8us& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
+  pstoreu_common<Packet8us>(to, from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16*      to, const Packet8bf& from)
+{
+  pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char*      to, const Packet16c& from)
+{
+  pstoreu_common<Packet16c>(to, from);
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char*      to, const Packet16uc& from)
+{
+  pstoreu_common<Packet16uc>(to, from);
 }
-#endif
 
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int   x; vec_ste(a, 0, &x); return x; }
+
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
+  vec_ste(a, 0, &x);
+  return x;
+}
+
+template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
+  return pfirst_common<Packet8s>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
+  return pfirst_common<Packet8us>(a);
+}
+
+template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
+{
+  return pfirst_common<Packet16c>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
+{
+  return pfirst_common<Packet16uc>(a);
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 {
@@ -543,10 +1150,296 @@ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 }
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 {
-  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); }
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
+{
+  return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+}
+template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
+{
+  return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+}
+template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
+{
+  return vec_perm(a, a, p16uc_REVERSE8);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
+{
+  return vec_perm(a, a, p16uc_REVERSE8);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
+{
+  return preverse<Packet8us>(a);
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8bf  pabs(const Packet8bf& a) {
+  _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
+  return pand<Packet8us>(p8us_abs_mask, a);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
+{ return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
+{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
+{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
+  return reinterpret_cast<Packet4f>(r);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
+  return reinterpret_cast<Packet4f>(r);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  return vec_sr(a, p4ui_mask);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  return vec_sl(a, p4ui_mask);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sl(a, p8us_mask);
+}
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
+{
+  const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sr(a, p8us_mask);
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
+  return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
+  return pand<Packet4f>(
+    reinterpret_cast<Packet4f>(bf.m_val),
+    reinterpret_cast<Packet4f>(p4ui_high_mask)
+  );
+}
+
+// Simple interleaving of bool masks, prevents true values from being
+// converted to NaNs.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
+  Packet4f bf_odd, bf_even;
+  bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
+  bf_even = plogical_shift_right<16>(even);
+  return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
+}
+
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
+  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
+  Packet4ui lsb = plogical_shift_right<16>(input);
+  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
+
+  _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
+  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
+  input = padd<Packet4ui>(input, rounding_bias);
+
+  //Test NaN and Subnormal - Begin
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
+  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
+  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000);
+  Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
+  Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
+
+  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
+  Packet4ui nan_selector = pandnot<Packet4ui>(
+      reinterpret_cast<Packet4ui>(is_max_exp),
+      reinterpret_cast<Packet4ui>(is_mant_zero)
+  );
+
+  Packet4ui subnormal_selector = pandnot<Packet4ui>(
+      reinterpret_cast<Packet4ui>(is_zero_exp),
+      reinterpret_cast<Packet4ui>(is_mant_zero)
+  );
+
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
+  input = vec_sel(input, p4ui_nan, nan_selector);
+  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
+  //Test NaN and Subnormal - End
+
+  input = plogical_shift_right<16>(input);
+  return reinterpret_cast<Packet8us>(input);
+}
+
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
+  Packet4f bf_odd, bf_even;
+  bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val);
+  bf_odd = plogical_shift_left<16>(bf_odd);
+  bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val);
+  return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
+}
+#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f op_even = OP(a_even);\
+  Packet4f op_odd = OP(a_odd);\
+  return F32ToBf16(op_even, op_odd);\
+
+#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f b_even = Bf16ToF32Even(B);\
+  Packet4f b_odd = Bf16ToF32Odd(B);\
+  Packet4f op_even = OP(a_even, b_even);\
+  Packet4f op_odd = OP(a_odd, b_odd);\
+  return F32ToBf16(op_even, op_odd);\
+
+#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);\
+  Packet4f a_odd = Bf16ToF32Odd(A);\
+  Packet4f b_even = Bf16ToF32Even(B);\
+  Packet4f b_odd = Bf16ToF32Odd(B);\
+  Packet4f op_even = OP(a_even, b_even);\
+  Packet4f op_odd = OP(a_odd, b_odd);\
+  return F32ToBf16Bool(op_even, op_odd);\
+
+template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a,exponent);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
+  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a,exponent);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f e_even;
+  Packet4f e_odd;
+  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
+  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
+  e = F32ToBf16(e_even, e_odd);
+  return F32ToBf16(op_even, op_odd);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
+}
+template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
+  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const  bfloat16*     from)
+{
+  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
+                            bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
+  return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
+}
 
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
@@ -558,34 +1451,6 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
   return pfirst(sum);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  Packet4f v[4], sum[4];
-
-  // It's easier and faster to transpose then add as columns
-  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
-  // Do the transpose, first set of moves
-  v[0] = vec_mergeh(vecs[0], vecs[2]);
-  v[1] = vec_mergel(vecs[0], vecs[2]);
-  v[2] = vec_mergeh(vecs[1], vecs[3]);
-  v[3] = vec_mergel(vecs[1], vecs[3]);
-  // Get the resulting vectors
-  sum[0] = vec_mergeh(v[0], v[2]);
-  sum[1] = vec_mergel(v[0], v[2]);
-  sum[2] = vec_mergeh(v[1], v[3]);
-  sum[3] = vec_mergel(v[1], v[3]);
-
-  // Now do the summation:
-  // Lines 0+1
-  sum[0] = sum[0] + sum[1];
-  // Lines 2+3
-  sum[1] = sum[2] + sum[3];
-  // Add the results
-  sum[0] = sum[0] + sum[1];
-
-  return sum[0];
-}
-
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i sum;
@@ -598,141 +1463,377 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
   return pfirst(sum);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = redux_even + redux_odd;
+  return bfloat16(f32_result);
+}
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
+{
+  union{
+    Packet v;
+    __UNPACK_TYPE__(Packet) n[8];
+  } vt;
+  vt.v = a;
+
+  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
+  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
+  Packet4i first_half  = pload<Packet4i>(first_loader);
+  Packet4i second_half = pload<Packet4i>(second_loader);
+
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
+}
+
+template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
+{
+  return predux_size8<Packet8s>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
+{
+  return predux_size8<Packet8us>(a);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
+{
+  union{
+    Packet v;
+    __UNPACK_TYPE__(Packet) n[16];
+  } vt;
+  vt.v = a;
+
+  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
+  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
+  EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
+  EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
+
+  Packet4i first_quarter = pload<Packet4i>(first_loader);
+  Packet4i second_quarter = pload<Packet4i>(second_loader);
+  Packet4i third_quarter = pload<Packet4i>(third_loader);
+  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
+
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
+		                  + predux(third_quarter) + predux(fourth_quarter));
+}
+
+template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
+{
+  return predux_size16<Packet16c>(a);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
+{
+  return predux_size16<Packet16uc>(a);
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f prod;
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
+{
+  Packet8s pair, quad, octo;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
+{
+  Packet8us pair, quad, octo;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_mul<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = redux_even * redux_odd;
+  return bfloat16(f32_result);
+}
+
+
+template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
+{
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+  result = vec_mul(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
+{
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+  result = vec_mul(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+// min
+template<typename Packet> EIGEN_STRONG_INLINE
+__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
+{
+  Packet b, res;
+  b = vec_min(a, vec_sld(a, a, 8));
+  res = vec_min(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  return predux_min4<Packet4f>(a);
+}
+
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  return predux_min4<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_min<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = (std::min)(redux_even, redux_odd);
+  return bfloat16(f32_result);
+}
+
+template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
+{
+  Packet8s pair, quad, octo;
+  
+  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8)); 
+
+  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
+{
+  Packet8us pair, quad, octo;
+  
+  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8)); 
+
+  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
+{
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_min(a, vec_sld(a, a, 8));
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  result = vec_min(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
+{
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_min(a, vec_sld(a, a, 8));
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  result = vec_min(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+// max
+template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
+{
+  Packet b, res;
+  b = vec_max(a, vec_sld(a, a, 8));
+  res = vec_max(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  return predux_max4<Packet4f>(a);
+}
+
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  return predux_max4<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
+{
+  float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd  = predux_max<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = (std::max)(redux_even, redux_odd);
+  return bfloat16(f32_result);
+}
+
+template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
+{
+  Packet8s pair, quad, octo;
+  
+  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8)); 
+
+  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+
+  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
 {
-  Packet4i v[4], sum[4];
+  Packet8us pair, quad, octo;
+  
+  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8)); 
 
-  // It's easier and faster to transpose then add as columns
-  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
-  // Do the transpose, first set of moves
-  v[0] = vec_mergeh(vecs[0], vecs[2]);
-  v[1] = vec_mergel(vecs[0], vecs[2]);
-  v[2] = vec_mergeh(vecs[1], vecs[3]);
-  v[3] = vec_mergel(vecs[1], vecs[3]);
-  // Get the resulting vectors
-  sum[0] = vec_mergeh(v[0], v[2]);
-  sum[1] = vec_mergel(v[0], v[2]);
-  sum[2] = vec_mergeh(v[1], v[3]);
-  sum[3] = vec_mergel(v[1], v[3]);
+  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
 
-  // Now do the summation:
-  // Lines 0+1
-  sum[0] = sum[0] + sum[1];
-  // Lines 2+3
-  sum[1] = sum[2] + sum[3];
-  // Add the results
-  sum[0] = sum[0] + sum[1];
-
-  return sum[0];
+  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
 }
 
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
 {
-  Packet4f prod;
-  prod = pmul(a, vec_sld(a, a, 8));
-  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_max(a, vec_sld(a, a, 8));
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  result = vec_max(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
 {
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  return aux[0] * aux[1] * aux[2] * aux[3];
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_max(a, vec_sld(a, a, 8));
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  result = vec_max(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
 }
 
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
 {
-  Packet4f b, res;
-  b = vec_min(a, vec_sld(a, a, 8));
-  res = vec_min(b, vec_sld(b, b, 4));
-  return pfirst(res);
+  return vec_any_ne(x, pzero(x));
 }
 
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
-  Packet4i b, res;
-  b = vec_min(a, vec_sld(a, a, 8));
-  res = vec_min(b, vec_sld(b, b, 4));
-  return pfirst(res);
+template <typename T> EIGEN_DEVICE_FUNC inline void
+ptranpose_common(PacketBlock<T,4>& kernel){
+  T t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  Packet4f b, res;
-  b = vec_max(a, vec_sld(a, a, 8));
-  res = vec_max(b, vec_sld(b, b, 4));
-  return pfirst(res);
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  ptranpose_common<Packet4f>(kernel);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
-  Packet4i b, res;
-  b = vec_max(a, vec_sld(a, a, 8));
-  res = vec_max(b, vec_sld(b, b, 4));
-  return pfirst(res);
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  ptranpose_common<Packet4i>(kernel);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-#ifdef _BIG_ENDIAN
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(first, second, 4); break;
-    case 2:
-      first = vec_sld(first, second, 8); break;
-    case 3:
-      first = vec_sld(first, second, 12); break;
-    }
-#else
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(second, first, 12); break;
-    case 2:
-      first = vec_sld(second, first, 8); break;
-    case 3:
-      first = vec_sld(second, first, 4); break;
-    }
-#endif
-  }
-};
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8s,4>& kernel) {
+  Packet8s t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8us,4>& kernel) {
+  Packet8us t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
 
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-#ifdef _BIG_ENDIAN
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(first, second, 4); break;
-    case 2:
-      first = vec_sld(first, second, 8); break;
-    case 3:
-      first = vec_sld(first, second, 12); break;
-    }
-#else
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(second, first, 12); break;
-    case 2:
-      first = vec_sld(second, first, 8); break;
-    case 3:
-      first = vec_sld(second, first, 4); break;
-    }
-#endif
-  }
-};
 
 EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  Packet4f t0, t1, t2, t3;
+ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+  Packet8us t0, t1, t2, t3;
+
+  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
+  t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
+  t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
+  t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16c,4>& kernel) {
+  Packet16c t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
   t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -743,9 +1844,10 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
+
 EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  Packet4i t0, t1, t2, t3;
+ptranspose(PacketBlock<Packet16uc,4>& kernel) {
+  Packet16uc t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
   t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -756,18 +1858,398 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8s,8>& kernel) {
+  Packet8s v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+  sum[0] = vec_mergeh(v[0], v[4]);
+  sum[1] = vec_mergel(v[0], v[4]);
+  sum[2] = vec_mergeh(v[1], v[5]);
+  sum[3] = vec_mergel(v[1], v[5]);
+  sum[4] = vec_mergeh(v[2], v[6]);
+  sum[5] = vec_mergel(v[2], v[6]);
+  sum[6] = vec_mergeh(v[3], v[7]);
+  sum[7] = vec_mergel(v[3], v[7]);
+
+  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8us,8>& kernel) {
+  Packet8us v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+  sum[0] = vec_mergeh(v[0], v[4]);
+  sum[1] = vec_mergel(v[0], v[4]);
+  sum[2] = vec_mergeh(v[1], v[5]);
+  sum[3] = vec_mergel(v[1], v[5]);
+  sum[4] = vec_mergeh(v[2], v[6]);
+  sum[5] = vec_mergel(v[2], v[6]);
+  sum[6] = vec_mergeh(v[3], v[7]);
+  sum[7] = vec_mergel(v[3], v[7]);
+
+  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+  Packet8bf v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
+  v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
+  v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
+  v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
+  v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
+  v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
+  v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
+  v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
+  sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
+  sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
+  sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
+  sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
+  sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
+  sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
+  sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
+  sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
+
+  kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
+  kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
+  kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
+  kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
+  kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
+  kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
+  kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
+  kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16c,16>& kernel) {
+  Packet16c step1[16], step2[16], step3[16];
+
+  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+  step2[0]  = vec_mergeh(step1[0], step1[8]);
+  step2[1]  = vec_mergel(step1[0], step1[8]);
+  step2[2]  = vec_mergeh(step1[1], step1[9]);
+  step2[3]  = vec_mergel(step1[1], step1[9]);
+  step2[4]  = vec_mergeh(step1[2], step1[10]);
+  step2[5]  = vec_mergel(step1[2], step1[10]);
+  step2[6]  = vec_mergeh(step1[3], step1[11]);
+  step2[7]  = vec_mergel(step1[3], step1[11]);
+  step2[8]  = vec_mergeh(step1[4], step1[12]);
+  step2[9]  = vec_mergel(step1[4], step1[12]);
+  step2[10] = vec_mergeh(step1[5], step1[13]);
+  step2[11] = vec_mergel(step1[5], step1[13]);
+  step2[12] = vec_mergeh(step1[6], step1[14]);
+  step2[13] = vec_mergel(step1[6], step1[14]);
+  step2[14] = vec_mergeh(step1[7], step1[15]);
+  step2[15] = vec_mergel(step1[7], step1[15]);
+
+  step3[0]  = vec_mergeh(step2[0], step2[8]);
+  step3[1]  = vec_mergel(step2[0], step2[8]);
+  step3[2]  = vec_mergeh(step2[1], step2[9]);
+  step3[3]  = vec_mergel(step2[1], step2[9]);
+  step3[4]  = vec_mergeh(step2[2], step2[10]);
+  step3[5]  = vec_mergel(step2[2], step2[10]);
+  step3[6]  = vec_mergeh(step2[3], step2[11]);
+  step3[7]  = vec_mergel(step2[3], step2[11]);
+  step3[8]  = vec_mergeh(step2[4], step2[12]);
+  step3[9]  = vec_mergel(step2[4], step2[12]);
+  step3[10] = vec_mergeh(step2[5], step2[13]);
+  step3[11] = vec_mergel(step2[5], step2[13]);
+  step3[12] = vec_mergeh(step2[6], step2[14]);
+  step3[13] = vec_mergel(step2[6], step2[14]);
+  step3[14] = vec_mergeh(step2[7], step2[15]);
+  step3[15] = vec_mergel(step2[7], step2[15]);
+
+  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
+  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16uc,16>& kernel) {
+  Packet16uc step1[16], step2[16], step3[16];
+
+  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+  step2[0]  = vec_mergeh(step1[0], step1[8]);
+  step2[1]  = vec_mergel(step1[0], step1[8]);
+  step2[2]  = vec_mergeh(step1[1], step1[9]);
+  step2[3]  = vec_mergel(step1[1], step1[9]);
+  step2[4]  = vec_mergeh(step1[2], step1[10]);
+  step2[5]  = vec_mergel(step1[2], step1[10]);
+  step2[6]  = vec_mergeh(step1[3], step1[11]);
+  step2[7]  = vec_mergel(step1[3], step1[11]);
+  step2[8]  = vec_mergeh(step1[4], step1[12]);
+  step2[9]  = vec_mergel(step1[4], step1[12]);
+  step2[10] = vec_mergeh(step1[5], step1[13]);
+  step2[11] = vec_mergel(step1[5], step1[13]);
+  step2[12] = vec_mergeh(step1[6], step1[14]);
+  step2[13] = vec_mergel(step1[6], step1[14]);
+  step2[14] = vec_mergeh(step1[7], step1[15]);
+  step2[15] = vec_mergel(step1[7], step1[15]);
+
+  step3[0]  = vec_mergeh(step2[0], step2[8]);
+  step3[1]  = vec_mergel(step2[0], step2[8]);
+  step3[2]  = vec_mergeh(step2[1], step2[9]);
+  step3[3]  = vec_mergel(step2[1], step2[9]);
+  step3[4]  = vec_mergeh(step2[2], step2[10]);
+  step3[5]  = vec_mergel(step2[2], step2[10]);
+  step3[6]  = vec_mergeh(step2[3], step2[11]);
+  step3[7]  = vec_mergel(step2[3], step2[11]);
+  step3[8]  = vec_mergeh(step2[4], step2[12]);
+  step3[9]  = vec_mergel(step2[4], step2[12]);
+  step3[10] = vec_mergeh(step2[5], step2[13]);
+  step3[11] = vec_mergel(step2[5], step2[13]);
+  step3[12] = vec_mergeh(step2[6], step2[14]);
+  step3[13] = vec_mergel(step2[6], step2[14]);
+  step3[14] = vec_mergeh(step2[7], step2[15]);
+  step3[15] = vec_mergel(step2[7], step2[15]);
+
+  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
+  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+}
+
+template<typename Packet> EIGEN_STRONG_INLINE
+Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
   Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
   Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
-  Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
+  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
+  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
+  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
+  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
+  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
+  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+
+  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
+  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+
+  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<bfloat16, unsigned short int> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template <>
+struct type_casting_traits<unsigned short int, bfloat16> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return vec_cts(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return vec_ctu(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return vec_ctf(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return vec_ctf(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
+  Packet4f float_even = Bf16ToF32Even(a);
+  Packet4f float_odd = Bf16ToF32Odd(a);
+  Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
+  Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+  Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
+  Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
+
+  //Check values that are bigger than USHRT_MAX (0xFFFF)
+  Packet4bi overflow_selector;
+  if(vec_any_gt(int_even, p4ui_low_mask)){
+    overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
+    low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+  }
+  if(vec_any_gt(int_odd, p4ui_low_mask)){
+    overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
+    low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+  }
+
+  low_odd = plogical_shift_left<16>(low_odd);
+
+  Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
+  return reinterpret_cast<Packet8us>(int_final);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
+  //short -> int -> float -> bfloat16
+  const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+  Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
+  Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
+  Packet4ui int_odd = plogical_shift_right<16>(int_cast);
+  Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
+  Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
+  return F32ToBf16(float_even, float_odd);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+  return reinterpret_cast<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+  return reinterpret_cast<Packet4f>(a);
+}
+
+
 
 //---------- double ----------
 #ifdef __VSX__
@@ -782,9 +2264,12 @@ typedef __vector __bool long         Packet2bl;
 
 static Packet2l  p2l_ONE  = { 1, 1 };
 static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
+static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
 static Packet2d  p2d_ONE  = { 1.0, 1.0 };
 static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
-static Packet2d  p2d_MZERO = { -0.0, -0.0 };
+static Packet2d  p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
+                               numext::bit_cast<double>(0x8000000000000000ull) };
 
 #ifdef _BIG_ENDIAN
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
@@ -792,16 +2277,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_c
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
 #endif
 
-template<int index> Packet2d vec_splat_dbl(Packet2d& a);
-
-template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
-{
-  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
+template<int index> Packet2d vec_splat_dbl(Packet2d& a)
 {
-  return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
+  return vec_splat(a, index);
 }
 
 template<> struct packet_traits<double> : default_packet_traits
@@ -830,12 +2308,13 @@ template<> struct packet_traits<double> : default_packet_traits
     HasRound = 1,
     HasFloor = 1,
     HasCeil = 1,
+    HasRint = 1,
     HasNegate = 1,
     HasBlend = 1
   };
 };
 
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
 
 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
 {
@@ -863,21 +2342,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
 {
   EIGEN_DEBUG_ALIGNED_LOAD
-#ifdef __VSX__
-  return vec_vsx_ld(0, from);
-#else
-  return vec_ld(0, from);
-#endif
+  return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
 }
 
 template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
   EIGEN_DEBUG_ALIGNED_STORE
-#ifdef __VSX__
-  vec_vsx_st(from, 0, to);
-#else
-  vec_st(from, 0, to);
-#endif
+  vec_xst(from, 0, to);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
@@ -885,28 +2356,32 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
   return v;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
+  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
+  return reinterpret_cast<Packet2d>(v);
+}
+
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                       Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
 {
-  a1 = pload<Packet2d>(a);
-  a0 = vec_splat_dbl<0>(a1);
-  a1 = vec_splat_dbl<1>(a1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat_dbl<0>(a3);
-  a3 = vec_splat_dbl<1>(a3);
+  //This way is faster than vec_splat (at least for doubles in Power 9)
+  a0 = pset1<Packet2d>(a[0]);
+  a1 = pset1<Packet2d>(a[1]);
+  a2 = pset1<Packet2d>(a[2]);
+  a3 = pset1<Packet2d>(a[3]);
 }
 
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
  return pload<Packet2d>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
   pstore<double>(af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -930,6 +2405,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d&
 
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet2d ret;
   __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -937,11 +2413,20 @@ template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const
 
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet2d ret;
   __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
+  return vec_nor(c,c);
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
 
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
@@ -950,14 +2435,34 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const
 
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
 
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+    Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+    Packet2d res;
+
+    __asm__("xvrdpiz %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (t));
+
+    return res;
+}
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
+{
+    Packet2d res;
+
+    __asm__("xvrdpic %x0, %x1\n\t"
+        : "=&wa" (res)
+        : "wa" (a));
+
+    return res;
+}
 
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<double*>(from));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
@@ -970,13 +2475,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
 {
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
+  EIGEN_DEBUG_UNALIGNED_STORE
+  vec_xst(from, 0, to);
 }
 
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
 
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 {
@@ -984,6 +2489,177 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 }
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
 
+// VSX support varies between different compilers and even different
+// versions of the same compiler.  For gcc version >= 4.9.3, we can use
+// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
+// a slow version that works with older compilers. 
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
+template<>
+inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
+    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+  return vec_cts(x, 0);    // TODO: check clang version.
+#else
+  double tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2l l = { static_cast<long long>(tmp[0]),
+                 static_cast<long long>(tmp[1]) };
+  return l;
+#endif
+}
+
+template<>
+inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
+  unsigned long long tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));
+  Packet2d d = { static_cast<double>(tmp[0]),
+                 static_cast<double>(tmp[1]) };
+  return d;
+}
+
+
+// Packet2l shifts.
+// For POWER8 we simply use vec_sr/l. 
+//
+// Things are more complicated for POWER7. There is actually a
+// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
+// So we need to shift by N % 32 and rearrage bytes.
+#ifdef __POWER8_VECTOR__
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  const Packet2ul shift = { N, N };
+  return vec_sl(a, shift); 
+}
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  const Packet2ul shift = { N, N };
+  return vec_sr(a, shift); 
+}
+
+#else
+
+// Shifts [A, B, C, D] to [B, 0, D, 0].
+// Used to implement left shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
+  static const Packet16uc perm = {
+      0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
+      0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+  #ifdef  _BIG_ENDIAN
+    return vec_perm(p4i_ZERO, a, perm);
+  #else
+    return vec_perm(a, p4i_ZERO, perm);
+  #endif
+}
+
+// Shifts [A, B, C, D] to [0, A, 0, C].
+// Used to implement right shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
+  static const Packet16uc perm = {
+      0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 
+      0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
+  #ifdef  _BIG_ENDIAN
+    return vec_perm(p4i_ZERO, a, perm);
+  #else
+    return vec_perm(a, p4i_ZERO, perm);
+  #endif
+}
+
+template<int N, typename EnableIf = void>
+struct plogical_shift_left_impl;
+
+template<int N>
+struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_right = {m, m, m, m};
+    const Packet4i out_hi = vec_sl(ai, shift);
+    const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template<int N>
+struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
+  }
+};
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return plogical_shift_left_impl<N>::run(a); 
+}
+
+template<int N, typename EnableIf = void>
+struct plogical_shift_right_impl;
+
+template<int N>
+struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_left = {m, m, m, m};
+    const Packet4i out_lo = vec_sr(ai, shift);
+    const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template<int N>
+struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
+  }
+};
+
+template<int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return plogical_shift_right_impl<N>::run(a); 
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+  // Split 2^e into four factors and multiply:
+  const Packet2l  bias = { 1023, 1023 };
+  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4)
+  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
+  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
+  out = pmul(out, c); // a * 2^e
+  return out;
+}
+
+
+// Extract exponent without existence of Packet2l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 {
   Packet2d b, sum;
@@ -992,20 +2668,6 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
   return pfirst<Packet2d>(sum);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  Packet2d v[2], sum;
-  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
-  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
-
-#ifdef _BIG_ENDIAN
-  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
-#else
-  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
-#endif
-
-  return sum;
-}
 // Other reduction functions:
 // mul
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
@@ -1025,20 +2687,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
   return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset == 1)
-#ifdef _BIG_ENDIAN
-      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8));
-#else
-      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8));
-#endif
-  }
-};
-
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2d,2>& kernel) {
   Packet2d t0, t1;
@@ -1053,6 +2701,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
   Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
   return vec_sel(elsePacket, thenPacket, mask);
 }
+
+
 #endif // __VSX__
 } // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/CUDA/Complex.h b/contrib/eigen/Eigen/src/Core/arch/CUDA/Complex.h
index 9c25365090bb88554ddf5b448d43561d55fbff68..deb4c8694f8581fd6e8f485ef6ec9a74ecf64712 100644
--- a/contrib/eigen/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/contrib/eigen/Eigen/src/Core/arch/CUDA/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2021 C. Antonio Sanchez <cantonios@google.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,93 +12,247 @@
 #define EIGEN_COMPLEX_CUDA_H
 
 // clang-format off
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, GCC and older versions of clang do
+// not treat them as device functions and thus Eigen functors making use of
+// these operators fail to compile. Here, we manually specialize these
+// operators and functors for complex types when building for CUDA to enable
+// their use on-device.
+
+#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
+    
+// ICC already specializes std::complex<float> and std::complex<double>
+// operators, preventing us from making them device functions here.
+// This will lead to silent runtime errors if the operators are used on device.
+//
+// To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_
+// prior to first inclusion of <complex>.  This prevents ICC from adding
+// its own specializations, so our custom ones below can be used instead.
+#if !(defined(EIGEN_COMP_ICC) && defined(_USE_COMPLEX_SPECIALIZATION_))
+
+// Import Eigen's internal operator specializations.
+#define EIGEN_USING_STD_COMPLEX_OPERATORS           \
+  using Eigen::complex_operator_detail::operator+;  \
+  using Eigen::complex_operator_detail::operator-;  \
+  using Eigen::complex_operator_detail::operator*;  \
+  using Eigen::complex_operator_detail::operator/;  \
+  using Eigen::complex_operator_detail::operator+=; \
+  using Eigen::complex_operator_detail::operator-=; \
+  using Eigen::complex_operator_detail::operator*=; \
+  using Eigen::complex_operator_detail::operator/=; \
+  using Eigen::complex_operator_detail::operator==; \
+  using Eigen::complex_operator_detail::operator!=;
 
 namespace Eigen {
 
-namespace internal {
+// Specialized std::complex overloads.
+namespace complex_operator_detail {
 
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_multiply(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  return std::complex<T>(
+      a_real * b_real - a_imag * b_imag,
+      a_imag * b_real + a_real * b_imag);
+}
 
-// Many std::complex methods such as operator+, operator-, operator* and
-// operator/ are not constexpr. Due to this, clang does not treat them as device
-// functions and thus Eigen functors making use of these operators fail to
-// compile. Here, we manually specialize these functors for complex types when
-// building for CUDA to avoid non-constexpr methods.
-
-// Sum
-template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
-  typedef typename std::complex<T> result_type;
-
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    return std::complex<T>(numext::real(a) + numext::real(b),
-                           numext::imag(a) + numext::imag(b));
-  }
-};
-
-template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
-
-
-// Difference
-template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
-  typedef typename std::complex<T> result_type;
-
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    return std::complex<T>(numext::real(a) - numext::real(b),
-                           numext::imag(a) - numext::imag(b));
-  }
-};
-
-template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
-
-
-// Product
-template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
-  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasMul
-  };
-  typedef typename std::complex<T> result_type;
-
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    const T a_real = numext::real(a);
-    const T a_imag = numext::imag(a);
-    const T b_real = numext::real(b);
-    const T b_imag = numext::imag(b);
-    return std::complex<T>(a_real * b_real - a_imag * b_imag,
-                           a_real * b_imag + a_imag * b_real);
-  }
-};
-
-template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
-
-
-// Quotient
-template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
-  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasDiv
-  };
-  typedef typename std::complex<T> result_type;
-
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
-    const T a_real = numext::real(a);
-    const T a_imag = numext::imag(a);
-    const T b_real = numext::real(b);
-    const T b_imag = numext::imag(b);
-    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
-    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
-                           (a_imag * b_real - a_real * b_imag) * norm);
-  }
-};
-
-template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide_fast(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  const T norm = (b_real * b_real + b_imag * b_imag);
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm,
+                          (a_imag * b_real - a_real * b_imag) / norm);
+}
 
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf),
+  // guards against over/under-flow.
+  const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real);
+  const T rscale = scale_imag ? T(1) : b_real / b_imag;
+  const T iscale = scale_imag ? b_imag / b_real : T(1);
+  const T denominator = b_real * rscale + b_imag * iscale;
+  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator, 
+                         (a_imag * rscale - a_real * iscale) / denominator);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<T> complex_divide(const std::complex<T>& a, const std::complex<T>& b) {
+#if EIGEN_FAST_MATH
+  return complex_divide_fast(a, b);
+#else
+  return complex_divide_stable(a, b);
 #endif
+}
+
+// NOTE: We cannot specialize compound assignment operators with Scalar T,
+//         (i.e.  operator@=(const T&), for @=+,-,*,/)
+//       since they are already specialized for float/double/long double within
+//       the standard <complex> header. We also do not specialize the stream
+//       operators.
+#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T)                                    \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a) { return a; }                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a) {                                           \
+  return std::complex<T>(-numext::real(a), -numext::imag(a));                                   \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) + b, numext::imag(a));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator+(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a + numext::real(b), numext::imag(b));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) - b, numext::imag(a));                                 \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator-(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a - numext::real(b), -numext::imag(b));                                \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return complex_multiply(a, b);                                                                \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) * b, numext::imag(a) * b);                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator*(const T& a, const std::complex<T>& b) {                               \
+  return std::complex<T>(a * numext::real(b), a * numext::imag(b));                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const std::complex<T>& a, const std::complex<T>& b) {                 \
+  return complex_divide(a, b);                                                                  \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const std::complex<T>& a, const T& b) {                               \
+  return std::complex<T>(numext::real(a) / b, numext::imag(a) / b);                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T> operator/(const T& a, const std::complex<T>& b) {                               \
+  return complex_divide(std::complex<T>(a, 0), b);                                              \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  numext::real_ref(a) += numext::real(b);                                                       \
+  numext::imag_ref(a) += numext::imag(b);                                                       \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  numext::real_ref(a) -= numext::real(b);                                                       \
+  numext::imag_ref(a) -= numext::imag(b);                                                       \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  a = complex_multiply(a, b);                                                                   \
+  return a;                                                                                     \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) {                     \
+  a = complex_divide(a, b);                                                                     \
+  return  a;                                                                                    \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const std::complex<T>& a, const std::complex<T>& b) {                           \
+  return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b);              \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const std::complex<T>& a, const T& b) {                                         \
+  return numext::real(a) == b && numext::imag(a) == 0;                                          \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator==(const T& a, const std::complex<T>& b) {                                         \
+  return a  == numext::real(b) && 0 == numext::imag(b);                                         \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const std::complex<T>& a, const std::complex<T>& b) {                           \
+  return !(a == b);                                                                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const std::complex<T>& a, const T& b) {                                         \
+  return !(a == b);                                                                             \
+}                                                                                               \
+                                                                                                \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
+bool operator!=(const T& a, const std::complex<T>& b) {                                         \
+  return !(a == b);                                                                             \
+}
+
+// Do not specialize for long double, since that reduces to double on device.
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double)
+
+#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
+
+  
+}  // namespace complex_operator_detail
+
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+namespace numext {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+}  // namespace numext
+
+namespace internal {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+}  // namespace internal
+}  // namespace Eigen
 
-} // end namespace internal
+#endif  // !(EIGEN_COMP_ICC && _USE_COMPLEX_SPECIALIZATION_)
 
-} // end namespace Eigen
+#endif  // EIGEN_CUDACC && EIGEN_GPU_COMPILE_PHASE
 
-#endif // EIGEN_COMPLEX_CUDA_H
+#endif  // EIGEN_COMPLEX_CUDA_H
diff --git a/contrib/eigen/Eigen/src/Core/arch/CUDA/Half.h b/contrib/eigen/Eigen/src/Core/arch/CUDA/Half.h
deleted file mode 100644
index 59717b4fe6cc1c788491efd1dfd528d461449921..0000000000000000000000000000000000000000
--- a/contrib/eigen/Eigen/src/Core/arch/CUDA/Half.h
+++ /dev/null
@@ -1,675 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-//
-// The conversion routines are Copyright (c) Fabian Giesen, 2016.
-// The original license follows:
-//
-// Copyright (c) Fabian Giesen, 2016
-// All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-// Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// type Eigen::half (inheriting from CUDA's __half struct) with
-// operator overloads such that it behaves basically as an arithmetic
-// type. It will be quite slow on CPUs (so it is recommended to stay
-// in float32_bits for CPUs, except for simple parameter conversions, I/O
-// to disk and the likes), but fast on GPUs.
-
-
-#ifndef EIGEN_HALF_CUDA_H
-#define EIGEN_HALF_CUDA_H
-
-#if __cplusplus > 199711L
-#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
-#else
-#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
-#endif
-
-#include <sstream>
-
-namespace Eigen {
-
-struct half;
-
-namespace half_impl {
-
-#if !defined(EIGEN_HAS_CUDA_FP16)
-// Make our own __half_raw definition that is similar to CUDA's.
-struct __half_raw {
-  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
-  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
-  unsigned short x;
-};
-#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
-// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-typedef __half __half_raw;
-#endif
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
-
-struct half_base : public __half_raw {
-  EIGEN_DEVICE_FUNC half_base() {}
-  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
-  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
-  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
-#endif
-};
-
-} // namespace half_impl
-
-// Class definition.
-struct half : public half_impl::half_base {
-  #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
-    typedef half_impl::__half_raw __half_raw;
-  #endif
-
-  EIGEN_DEVICE_FUNC half() {}
-
-  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
-  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
-  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
-#endif
-
-  explicit EIGEN_DEVICE_FUNC half(bool b)
-      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
-  template<class T>
-  explicit EIGEN_DEVICE_FUNC half(const T& val)
-      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
-  explicit EIGEN_DEVICE_FUNC half(float f)
-      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
-    // +0.0 and -0.0 become false, everything else becomes true.
-    return (x & 0x7fff) != 0;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
-    return static_cast<signed char>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
-    return static_cast<unsigned char>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
-    return static_cast<short>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
-    return static_cast<unsigned short>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
-    return static_cast<int>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
-    return static_cast<unsigned int>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
-    return static_cast<long>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
-    return static_cast<unsigned long>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
-    return static_cast<long long>(half_impl::half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
-    return static_cast<unsigned long long>(half_to_float(*this));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
-    return half_impl::half_to_float(*this);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
-    return static_cast<double>(half_impl::half_to_float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
-    x = other.x;
-    return *this;
-  }
-};
-
-} // end namespace Eigen
-
-namespace std {
-template<>
-struct numeric_limits<Eigen::half> {
-  static const bool is_specialized = true;
-  static const bool is_signed = true;
-  static const bool is_integer = false;
-  static const bool is_exact = false;
-  static const bool has_infinity = true;
-  static const bool has_quiet_NaN = true;
-  static const bool has_signaling_NaN = true;
-  static const float_denorm_style has_denorm = denorm_present;
-  static const bool has_denorm_loss = false;
-  static const std::float_round_style round_style = std::round_to_nearest;
-  static const bool is_iec559 = false;
-  static const bool is_bounded = false;
-  static const bool is_modulo = false;
-  static const int digits = 11;
-  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static const int radix = 2;
-  static const int min_exponent = -13;
-  static const int min_exponent10 = -4;
-  static const int max_exponent = 16;
-  static const int max_exponent10 = 4;
-  static const bool traps = true;
-  static const bool tinyness_before = false;
-
-  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
-  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
-  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
-  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
-  static Eigen::half round_error() { return Eigen::half(0.5); }
-  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
-  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
-  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
-  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
-};
-
-// If std::numeric_limits<T> is specialized, should also specialize
-// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
-// std::numeric_limits<const volatile T>
-// https://stackoverflow.com/a/16519653/
-template<>
-struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
-template<>
-struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
-template<>
-struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
-} // end namespace std
-
-namespace Eigen {
-
-namespace half_impl {
-
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
-
-// Intrinsics for native fp16 support. Note that on current hardware,
-// these are no faster than float32_bits arithmetic (you need to use the half2
-// versions to get the ALU speed increased), but you do save the
-// conversion steps back and forth.
-
-EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
-  return __hadd(a, b);
-}
-EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
-  return __hmul(a, b);
-}
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
-  return __hsub(a, b);
-}
-EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
-  float num = __half2float(a);
-  float denom = __half2float(b);
-  return __float2half(num / denom);
-}
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
-  return __hneg(a);
-}
-EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
-  a = a + b;
-  return a;
-}
-EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
-  a = a * b;
-  return a;
-}
-EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
-  a = a - b;
-  return a;
-}
-EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
-  a = a / b;
-  return a;
-}
-EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
-  return __heq(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
-  return __hne(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
-  return __hlt(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
-  return __hle(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
-  return __hgt(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
-  return __hge(a, b);
-}
-
-#else  // Emulate support for half floats
-
-// Definitions for CPUs and older CUDA, mostly working through conversion
-// to/from float32_bits.
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
-  return half(float(a) + float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
-  return half(float(a) * float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
-  return half(float(a) - float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
-  return half(float(a) / float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
-  half result;
-  result.x = a.x ^ 0x8000;
-  return result;
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
-  a = half(float(a) + float(b));
-  return a;
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
-  a = half(float(a) * float(b));
-  return a;
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
-  a = half(float(a) - float(b));
-  return a;
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
-  a = half(float(a) / float(b));
-  return a;
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
-  return numext::equal_strict(float(a),float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
-  return numext::not_equal_strict(float(a), float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
-  return float(a) < float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
-  return float(a) <= float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
-  return float(a) > float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
-  return float(a) >= float(b);
-}
-
-#endif  // Emulate support for half floats
-
-// Division by an index. Do it in full float precision to avoid accuracy
-// issues in converting the denominator to half.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
-  return half(static_cast<float>(a) / static_cast<float>(b));
-}
-
-// Conversion routines, including fallbacks for the host or older CUDA.
-// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
-// these in hardware. If we need more performance on older/other CPUs, they are
-// also possible to vectorize directly.
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
-  __half_raw h;
-  h.x = x;
-  return h;
-}
-
-union float32_bits {
-  unsigned int u;
-  float f;
-};
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
-  __half tmp_ff = __float2half(ff);
-  return *(__half_raw*)&tmp_ff;
-
-#elif defined(EIGEN_HAS_FP16_C)
-  __half_raw h;
-  h.x = _cvtss_sh(ff, 0);
-  return h;
-
-#else
-  float32_bits f; f.f = ff;
-
-  const float32_bits f32infty = { 255 << 23 };
-  const float32_bits f16max = { (127 + 16) << 23 };
-  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
-  unsigned int sign_mask = 0x80000000u;
-  __half_raw o;
-  o.x = static_cast<unsigned short>(0x0u);
-
-  unsigned int sign = f.u & sign_mask;
-  f.u ^= sign;
-
-  // NOTE all the integer compares in this function can be safely
-  // compiled into signed compares since all operands are below
-  // 0x80000000. Important if you want fast straight SSE2 code
-  // (since there's no unsigned PCMPGTD).
-
-  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
-    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
-  } else {  // (De)normalized number or zero
-    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
-      // use a magic value to align our 10 mantissa bits at the bottom of
-      // the float. as long as FP addition is round-to-nearest-even this
-      // just works.
-      f.f += denorm_magic.f;
-
-      // and one integer subtract of the bias later, we have our final float!
-      o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
-    } else {
-      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
-
-      // update exponent, rounding bias part 1
-      f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
-      // rounding bias part 2
-      f.u += mant_odd;
-      // take the bits!
-      o.x = static_cast<unsigned short>(f.u >> 13);
-    }
-  }
-
-  o.x |= static_cast<unsigned short>(sign >> 16);
-  return o;
-#endif
-}
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
-  return __half2float(h);
-
-#elif defined(EIGEN_HAS_FP16_C)
-  return _cvtsh_ss(h.x);
-
-#else
-  const float32_bits magic = { 113 << 23 };
-  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
-  float32_bits o;
-
-  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
-  unsigned int exp = shifted_exp & o.u;   // just the exponent
-  o.u += (127 - 15) << 23;                // exponent adjust
-
-  // handle exponent special cases
-  if (exp == shifted_exp) {     // Inf/NaN?
-    o.u += (128 - 16) << 23;    // extra exp adjust
-  } else if (exp == 0) {        // Zero/Denormal?
-    o.u += 1 << 23;             // extra exp adjust
-    o.f -= magic.f;             // renormalize
-  }
-
-  o.u |= (h.x & 0x8000) << 16;    // sign bit
-  return o.f;
-#endif
-}
-
-// --- standard functions ---
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
-  return (a.x & 0x7fff) == 0x7c00;
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
-  return __hisnan(a);
-#else
-  return (a.x & 0x7fff) > 0x7c00;
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
-  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
-}
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
-  half result;
-  result.x = a.x & 0x7FFF;
-  return result;
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
-  return half(hexp(a));
-#else
-   return half(::expf(float(a)));
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
-  return half(::hlog(a));
-#else
-  return half(::logf(float(a)));
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
-  return half(numext::log1p(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
-  return half(::log10f(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
-  return half(hsqrt(a));
-#else
-    return half(::sqrtf(float(a)));
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
-  return half(::powf(float(a), float(b)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
-  return half(::sinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
-  return half(::cosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
-  return half(::tanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
-  return half(::tanhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
-  return half(hfloor(a));
-#else
-  return half(::floorf(float(a)));
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
-  return half(hceil(a));
-#else
-  return half(::ceilf(float(a)));
-#endif
-}
-
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
-  return __hlt(b, a) ? b : a;
-#else
-  const float f1 = static_cast<float>(a);
-  const float f2 = static_cast<float>(b);
-  return f2 < f1 ? b : a;
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
-  return __hlt(a, b) ? b : a;
-#else
-  const float f1 = static_cast<float>(a);
-  const float f2 = static_cast<float>(b);
-  return f1 < f2 ? b : a;
-#endif
-}
-
-EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
-  os << static_cast<float>(v);
-  return os;
-}
-
-} // end namespace half_impl
-
-// import Eigen::half_impl::half into Eigen namespace
-// using half_impl::half;
-
-namespace internal {
-
-template<>
-struct random_default_impl<half, false, false>
-{
-  static inline half run(const half& x, const half& y)
-  {
-    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
-  }
-  static inline half run()
-  {
-    return run(half(-1.f), half(1.f));
-  }
-};
-
-template<> struct is_arithmetic<half> { enum { value = true }; };
-
-} // end namespace internal
-
-template<> struct NumTraits<Eigen::half>
-    : GenericNumTraits<Eigen::half>
-{
-  enum {
-    IsSigned = true,
-    IsInteger = false,
-    IsComplex = false,
-    RequireInitialization = false
-  };
-
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
-    return half_impl::raw_uint16_to_half(0x0800);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
-    return half_impl::raw_uint16_to_half(0x7bff);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
-    return half_impl::raw_uint16_to_half(0xfbff);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
-    return half_impl::raw_uint16_to_half(0x7c00);
-  }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
-    return half_impl::raw_uint16_to_half(0x7c01);
-  }
-};
-
-} // end namespace Eigen
-
-// C-like standard mathematical functions and trancendentals.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
-  Eigen::half result;
-  result.x = a.x & 0x7FFF;
-  return result;
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
-  return Eigen::half(::expf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
-  return Eigen::half(::hlog(a));
-#else
-  return Eigen::half(::logf(float(a)));
-#endif
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
-  return Eigen::half(::sqrtf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
-  return Eigen::half(::powf(float(a), float(b)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
-  return Eigen::half(::floorf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
-  return Eigen::half(::ceilf(float(a)));
-}
-
-namespace std {
-
-#if __cplusplus > 199711L
-template <>
-struct hash<Eigen::half> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
-    return static_cast<std::size_t>(a.x);
-  }
-};
-#endif
-
-} // end namespace std
-
-
-// Add the missing shfl_xor intrinsic
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
-  #if EIGEN_CUDACC_VER < 90000
-  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
-  #else
-  return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
-  #endif
-}
-#endif
-
-// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
-  return Eigen::half_impl::raw_uint16_to_half(
-      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
-}
-#endif
-
-
-#if defined(EIGEN_CUDA_ARCH)
-namespace Eigen {
-namespace numext {
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isnan)(const Eigen::half& h) {
-  return (half_impl::isnan)(h);
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isinf)(const Eigen::half& h) {
-  return (half_impl::isinf)(h);
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isfinite)(const Eigen::half& h) {
-  return (half_impl::isfinite)(h);
-}
-
-} // namespace Eigen
-}  // namespace numext
-#endif
-
-#endif // EIGEN_HALF_CUDA_H
diff --git a/contrib/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h b/contrib/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h
deleted file mode 100644
index 0348b41db05b7dcc88432b34c77c8aba97c35c5c..0000000000000000000000000000000000000000
--- a/contrib/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ /dev/null
@@ -1,91 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
-#define EIGEN_MATH_FUNCTIONS_CUDA_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plog<float4>(const float4& a)
-{
-  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
-}
-
-template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plog<double2>(const double2& a)
-{
-  using ::log;
-  return make_double2(log(a.x), log(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plog1p<float4>(const float4& a)
-{
-  return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
-}
-
-template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plog1p<double2>(const double2& a)
-{
-  return make_double2(log1p(a.x), log1p(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pexp<float4>(const float4& a)
-{
-  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pexp<double2>(const double2& a)
-{
-  using ::exp;
-  return make_double2(exp(a.x), exp(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 psqrt<float4>(const float4& a)
-{
-  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 psqrt<double2>(const double2& a)
-{
-  using ::sqrt;
-  return make_double2(sqrt(a.x), sqrt(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 prsqrt<float4>(const float4& a)
-{
-  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 prsqrt<double2>(const double2& a)
-{
-  return make_double2(rsqrt(a.x), rsqrt(a.y));
-}
-
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
diff --git a/contrib/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h b/contrib/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h
deleted file mode 100644
index 4dda63188d20a2c881284e688f5ef527ed3a74a8..0000000000000000000000000000000000000000
--- a/contrib/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ /dev/null
@@ -1,333 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_CUDA_H
-#define EIGEN_PACKET_MATH_CUDA_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
-template<> struct is_arithmetic<float4>  { enum { value = true }; };
-template<> struct is_arithmetic<double2> { enum { value = true }; };
-
-template<> struct packet_traits<float> : default_packet_traits
-{
-  typedef float4 type;
-  typedef float4 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size=4,
-    HasHalfPacket = 0,
-
-    HasDiv  = 1,
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 1,
-    HasExp  = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasIGamma = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-
-    HasBlend = 0,
-  };
-};
-
-template<> struct packet_traits<double> : default_packet_traits
-{
-  typedef double2 type;
-  typedef double2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size=2,
-    HasHalfPacket = 0,
-
-    HasDiv  = 1,
-    HasLog  = 1,
-    HasExp  = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasIGamma = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-
-    HasBlend = 0,
-  };
-};
-
-
-template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
-  return make_float4(from, from, from, from);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
-  return make_double2(from, from);
-}
-
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
-  return make_float4(a, a+1, a+2, a+3);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
-  return make_double2(a, a+1);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x+b.x, a.y+b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x-b.x, a.y-b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
-  return make_float4(-a.x, -a.y, -a.z, -a.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
-  return make_double2(-a.x, -a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x*b.x, a.y*b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x/b.x, a.y/b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
-  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
-  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
-  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
-  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
-  return *reinterpret_cast<const float4*>(from);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
-  return *reinterpret_cast<const double2*>(from);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
-  return make_float4(from[0], from[1], from[2], from[3]);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
-  return make_double2(from[0], from[1]);
-}
-
-template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
-  return make_float4(from[0], from[0], from[1], from[1]);
-}
-template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
-  return make_double2(from[0], from[0]);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
-  *reinterpret_cast<float4*>(to) = from;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
-  *reinterpret_cast<double2*>(to) = from;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
-  to[0] = from.x;
-  to[1] = from.y;
-  to[2] = from.z;
-  to[3] = from.w;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
-  to[0] = from.x;
-  to[1] = from.y;
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const float4*)from);
-#else
-  return make_float4(from[0], from[1], from[2], from[3]);
-#endif
-}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg((const double2*)from);
-#else
-  return make_double2(from[0], from[1]);
-#endif
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
-#else
-  return make_float4(from[0], from[1], from[2], from[3]);
-#endif
-}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return make_double2(__ldg(from+0), __ldg(from+1));
-#else
-  return make_double2(from[0], from[1]);
-#endif
-}
-
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
-  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
-  return make_double2(from[0*stride], from[1*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
-  to[stride*0] = from.x;
-  to[stride*1] = from.y;
-  to[stride*2] = from.z;
-  to[stride*3] = from.w;
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
-  to[stride*0] = from.x;
-  to[stride*1] = from.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
-  return a.x;
-}
-template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
-  return a.x;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
-  return a.x + a.y + a.z + a.w;
-}
-template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
-  return a.x + a.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
-  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
-  return fmax(a.x, a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
-  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
-  return fmin(a.x, a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
-  return a.x * a.y * a.z * a.w;
-}
-template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
-  return a.x * a.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
-  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
-  return make_double2(fabs(a.x), fabs(a.y));
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<float4,4>& kernel) {
-  float tmp = kernel.packet[0].y;
-  kernel.packet[0].y = kernel.packet[1].x;
-  kernel.packet[1].x = tmp;
-
-  tmp = kernel.packet[0].z;
-  kernel.packet[0].z = kernel.packet[2].x;
-  kernel.packet[2].x = tmp;
-
-  tmp = kernel.packet[0].w;
-  kernel.packet[0].w = kernel.packet[3].x;
-  kernel.packet[3].x = tmp;
-
-  tmp = kernel.packet[1].z;
-  kernel.packet[1].z = kernel.packet[2].y;
-  kernel.packet[2].y = tmp;
-
-  tmp = kernel.packet[1].w;
-  kernel.packet[1].w = kernel.packet[3].y;
-  kernel.packet[3].y = tmp;
-
-  tmp = kernel.packet[2].w;
-  kernel.packet[2].w = kernel.packet[3].z;
-  kernel.packet[3].z = tmp;
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<double2,2>& kernel) {
-  double tmp = kernel.packet[0].y;
-  kernel.packet[0].y = kernel.packet[1].x;
-  kernel.packet[1].x = tmp;
-}
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-
-#endif // EIGEN_PACKET_MATH_CUDA_H
diff --git a/contrib/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/contrib/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
deleted file mode 100644
index f749c573ff6032c7a9667784c60f382ad64b60b8..0000000000000000000000000000000000000000
--- a/contrib/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ /dev/null
@@ -1,1124 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
-#define EIGEN_PACKET_MATH_HALF_CUDA_H
-
-
-namespace Eigen {
-namespace internal {
-
-// Most of the following operations require arch >= 3.0
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-
-template<> struct is_arithmetic<half2> { enum { value = true }; };
-
-template<> struct packet_traits<Eigen::half> : default_packet_traits
-{
-  typedef half2 type;
-  typedef half2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size=2,
-    HasHalfPacket = 0,
-    HasAdd    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasSqrt   = 1,
-    HasRsqrt  = 1,
-    HasExp    = 1,
-    HasLog    = 1,
-    HasLog1p  = 1
-  };
-};
-
-template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
-  return __half2half2(from);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
-  return *reinterpret_cast<const half2*>(from);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
-  return __halves2half2(from[0], from[1]);
-}
-
-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
-  return __halves2half2(from[0], from[0]);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
-  *reinterpret_cast<half2*>(to) = from;
-}
-
-template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
-  to[0] = __low2half(from);
-  to[1] = __high2half(from);
-}
-
-template<>
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
-#if __CUDA_ARCH__ >= 350
-   return __ldg((const half2*)from);
-#else
-  return __halves2half2(*(from+0), *(from+1));
-#endif
-}
-
-template<>
-__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
-#if __CUDA_ARCH__ >= 350
-   return __halves2half2(__ldg(from+0), __ldg(from+1));
-#else
-  return __halves2half2(*(from+0), *(from+1));
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
-  return __halves2half2(from[0*stride], from[1*stride]);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
-  to[stride*0] = __low2half(from);
-  to[stride*1] = __high2half(from);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
-  return __low2half(a);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
-  half2 result;
-  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
-  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
-  return result;
-}
-
-
-__device__ EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<half2,2>& kernel) {
-  __half a1 = __low2half(kernel.packet[0]);
-  __half a2 = __high2half(kernel.packet[0]);
-  __half b1 = __low2half(kernel.packet[1]);
-  __half b2 = __high2half(kernel.packet[1]);
-  kernel.packet[0] = __halves2half2(a1, b1);
-  kernel.packet[1] = __halves2half2(a2, b2);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
-#if __CUDA_ARCH__ >= 530
-  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
-#else
-  float f = __half2float(a) + 1.0f;
-  return __halves2half2(a, __float2half(f));
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
-  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
-  return __hsub2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 - b1;
-  float r2 = a2 - b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if __CUDA_ARCH__ >= 530
-  return __hneg2(a);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return __floats2half2_rn(-a1, -a2);
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
-  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
-#if __CUDA_ARCH__ >= 530
-   return __hfma2(a, b, c);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float c1 = __low2float(c);
-  float c2 = __high2float(c);
-  float r1 = a1 * b1 + c1;
-  float r2 = a2 * b2 + c2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
-  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
-  return __halves2half2(r1, r2);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
-  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
-  return __halves2half2(r1, r2);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
-  return __hadd(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half_rn(a1 + a2));
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
-  __half first = __low2half(a);
-  __half second = __high2half(a);
-  return __hgt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 > a2 ? __low2half(a) : __high2half(a);
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
-  __half first = __low2half(a);
-  __half second = __high2half(a);
-  return __hlt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 < a2 ? __low2half(a) : __high2half(a);
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
-  return __hmul(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half_rn(a1 * a2));
-#endif
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = log1pf(a1);
-  float r2 = log1pf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
-
-template<>  __device__ EIGEN_STRONG_INLINE
-half2 plog<half2>(const half2& a) {
-  return h2log(a);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE
-half2 pexp<half2>(const half2& a) {
-  return h2exp(a);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE
-half2 psqrt<half2>(const half2& a) {
-  return h2sqrt(a);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE
-half2 prsqrt<half2>(const half2& a) {
-  return h2rsqrt(a);
-}
-
-#else
-
-template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = logf(a1);
-  float r2 = logf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = expf(a1);
-  float r2 = expf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = sqrtf(a1);
-  float r2 = sqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = rsqrtf(a1);
-  float r2 = rsqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-#endif
-
-#elif defined EIGEN_VECTORIZE_AVX512
-
-typedef struct {
-  __m256i x;
-} Packet16h;
-
-
-template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
-
-template <>
-struct packet_traits<half> : default_packet_traits {
-  typedef Packet16h type;
-  // There is no half-size packet for Packet16h.
-  typedef Packet16h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 16,
-    HasHalfPacket = 0,
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
-    HasNegate = 0,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
-    HasSetLinear = 0,
-    HasDiv = 0,
-    HasSqrt = 0,
-    HasRsqrt = 0,
-    HasExp = 0,
-    HasLog = 0,
-    HasBlend = 0
-  };
-};
-
-
-template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
-
-template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
-  Packet16h result;
-  result.x = _mm256_set1_epi16(from.x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
-  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
-  Packet16h result;
-  result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
-  Packet16h result;
-  result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
-  _mm256_store_si256((__m256i*)to, from.x);
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
-  _mm256_storeu_si256((__m256i*)to, from.x);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h
-ploadquad(const Eigen::half* from) {
-  Packet16h result;
-  unsigned short a = from[0].x;
-  unsigned short b = from[1].x;
-  unsigned short c = from[2].x;
-  unsigned short d = from[3].x;
-  result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
-  return result;
-}
-
-EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
-#ifdef EIGEN_HAS_FP16_C
-  return _mm512_cvtph_ps(a.x);
-#else
-  EIGEN_ALIGN64 half aux[16];
-  pstore(aux, a);
-  float f0(aux[0]);
-  float f1(aux[1]);
-  float f2(aux[2]);
-  float f3(aux[3]);
-  float f4(aux[4]);
-  float f5(aux[5]);
-  float f6(aux[6]);
-  float f7(aux[7]);
-  float f8(aux[8]);
-  float f9(aux[9]);
-  float fa(aux[10]);
-  float fb(aux[11]);
-  float fc(aux[12]);
-  float fd(aux[13]);
-  float fe(aux[14]);
-  float ff(aux[15]);
-
-  return _mm512_set_ps(
-      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
-#endif
-}
-
-EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
-#ifdef EIGEN_HAS_FP16_C
-  Packet16h result;
-  result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
-  return result;
-#else
-  EIGEN_ALIGN64 float aux[16];
-  pstore(aux, a);
-  half h0(aux[0]);
-  half h1(aux[1]);
-  half h2(aux[2]);
-  half h3(aux[3]);
-  half h4(aux[4]);
-  half h5(aux[5]);
-  half h6(aux[6]);
-  half h7(aux[7]);
-  half h8(aux[8]);
-  half h9(aux[9]);
-  half ha(aux[10]);
-  half hb(aux[11]);
-  half hc(aux[12]);
-  half hd(aux[13]);
-  half he(aux[14]);
-  half hf(aux[15]);
-
-  Packet16h result;
-  result.x = _mm256_set_epi16(
-      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
-      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
-  return result;
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  Packet16f af = half2float(a);
-  Packet16f bf = half2float(b);
-  Packet16f rf = padd(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  Packet16f af = half2float(a);
-  Packet16f bf = half2float(b);
-  Packet16f rf = pmul(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
-  Packet16f from_float = half2float(from);
-  return half(predux(from_float));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
-{
-  Packet16h result;
-  result.x = _mm256_set_epi16(
-      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
-      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
-      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
-      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
-{
-  EIGEN_ALIGN64 half aux[16];
-  pstore(aux, from);
-  to[stride*0].x = aux[0].x;
-  to[stride*1].x = aux[1].x;
-  to[stride*2].x = aux[2].x;
-  to[stride*3].x = aux[3].x;
-  to[stride*4].x = aux[4].x;
-  to[stride*5].x = aux[5].x;
-  to[stride*6].x = aux[6].x;
-  to[stride*7].x = aux[7].x;
-  to[stride*8].x = aux[8].x;
-  to[stride*9].x = aux[9].x;
-  to[stride*10].x = aux[10].x;
-  to[stride*11].x = aux[11].x;
-  to[stride*12].x = aux[12].x;
-  to[stride*13].x = aux[13].x;
-  to[stride*14].x = aux[14].x;
-  to[stride*15].x = aux[15].x;
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,16>& kernel) {
-  __m256i a = kernel.packet[0].x;
-  __m256i b = kernel.packet[1].x;
-  __m256i c = kernel.packet[2].x;
-  __m256i d = kernel.packet[3].x;
-  __m256i e = kernel.packet[4].x;
-  __m256i f = kernel.packet[5].x;
-  __m256i g = kernel.packet[6].x;
-  __m256i h = kernel.packet[7].x;
-  __m256i i = kernel.packet[8].x;
-  __m256i j = kernel.packet[9].x;
-  __m256i k = kernel.packet[10].x;
-  __m256i l = kernel.packet[11].x;
-  __m256i m = kernel.packet[12].x;
-  __m256i n = kernel.packet[13].x;
-  __m256i o = kernel.packet[14].x;
-  __m256i p = kernel.packet[15].x;
-
-  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
-  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
-  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
-  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
-  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
-  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
-  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
-  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
-
-  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
-  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
-  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
-  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
-  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
-  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
-  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
-  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
-
-  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
-  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
-  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
-  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
-  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
-  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
-  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
-  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
-
-  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
-  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
-  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
-  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
-  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
-  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
-  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
-  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
-
-  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
-  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
-  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
-  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
-  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
-  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
-  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
-  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
-  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
-  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
-  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
-  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
-  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
-  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
-  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
-  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
-
-  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
-  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
-  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
-  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
-  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
-  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
-  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
-  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
-  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
-  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
-  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
-  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
-  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
-  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
-  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
-  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
-  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
-
-  kernel.packet[0].x = a_p_0;
-  kernel.packet[1].x = a_p_1;
-  kernel.packet[2].x = a_p_2;
-  kernel.packet[3].x = a_p_3;
-  kernel.packet[4].x = a_p_4;
-  kernel.packet[5].x = a_p_5;
-  kernel.packet[6].x = a_p_6;
-  kernel.packet[7].x = a_p_7;
-  kernel.packet[8].x = a_p_8;
-  kernel.packet[9].x = a_p_9;
-  kernel.packet[10].x = a_p_a;
-  kernel.packet[11].x = a_p_b;
-  kernel.packet[12].x = a_p_c;
-  kernel.packet[13].x = a_p_d;
-  kernel.packet[14].x = a_p_e;
-  kernel.packet[15].x = a_p_f;
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,8>& kernel) {
-  EIGEN_ALIGN64 half in[8][16];
-  pstore<half>(in[0], kernel.packet[0]);
-  pstore<half>(in[1], kernel.packet[1]);
-  pstore<half>(in[2], kernel.packet[2]);
-  pstore<half>(in[3], kernel.packet[3]);
-  pstore<half>(in[4], kernel.packet[4]);
-  pstore<half>(in[5], kernel.packet[5]);
-  pstore<half>(in[6], kernel.packet[6]);
-  pstore<half>(in[7], kernel.packet[7]);
-
-  EIGEN_ALIGN64 half out[8][16];
-
-  for (int i = 0; i < 8; ++i) {
-    for (int j = 0; j < 8; ++j) {
-      out[i][j] = in[j][2*i];
-    }
-    for (int j = 0; j < 8; ++j) {
-      out[i][j+8] = in[j][2*i+1];
-    }
-  }
-
-  kernel.packet[0] = pload<Packet16h>(out[0]);
-  kernel.packet[1] = pload<Packet16h>(out[1]);
-  kernel.packet[2] = pload<Packet16h>(out[2]);
-  kernel.packet[3] = pload<Packet16h>(out[3]);
-  kernel.packet[4] = pload<Packet16h>(out[4]);
-  kernel.packet[5] = pload<Packet16h>(out[5]);
-  kernel.packet[6] = pload<Packet16h>(out[6]);
-  kernel.packet[7] = pload<Packet16h>(out[7]);
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,4>& kernel) {
-  EIGEN_ALIGN64 half in[4][16];
-  pstore<half>(in[0], kernel.packet[0]);
-  pstore<half>(in[1], kernel.packet[1]);
-  pstore<half>(in[2], kernel.packet[2]);
-  pstore<half>(in[3], kernel.packet[3]);
-
-  EIGEN_ALIGN64 half out[4][16];
-
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      out[i][j] = in[j][4*i];
-    }
-    for (int j = 0; j < 4; ++j) {
-      out[i][j+4] = in[j][4*i+1];
-    }
-    for (int j = 0; j < 4; ++j) {
-      out[i][j+8] = in[j][4*i+2];
-    }
-    for (int j = 0; j < 4; ++j) {
-      out[i][j+12] = in[j][4*i+3];
-    }
-  }
-
-  kernel.packet[0] = pload<Packet16h>(out[0]);
-  kernel.packet[1] = pload<Packet16h>(out[1]);
-  kernel.packet[2] = pload<Packet16h>(out[2]);
-  kernel.packet[3] = pload<Packet16h>(out[3]);
-}
-
-
-#elif defined EIGEN_VECTORIZE_AVX
-
-typedef struct {
-  __m128i x;
-} Packet8h;
-
-
-template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
-
-template <>
-struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet8h type;
-  // There is no half-size packet for Packet8h.
-  typedef Packet8h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasHalfPacket = 0,
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
-    HasNegate = 0,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
-    HasSetLinear = 0,
-    HasDiv = 0,
-    HasSqrt = 0,
-    HasRsqrt = 0,
-    HasExp = 0,
-    HasLog = 0,
-    HasBlend = 0
-  };
-};
-
-
-template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
-
-template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
-  Packet8h result;
-  result.x = _mm_set1_epi16(from.x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
-  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
-  Packet8h result;
-  result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
-  Packet8h result;
-  result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
-  _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
-  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h
-ploadquad<Packet8h>(const Eigen::half* from) {
-  Packet8h result;
-  unsigned short a = from[0].x;
-  unsigned short b = from[1].x;
-  result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
-  return result;
-}
-
-EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
-#ifdef EIGEN_HAS_FP16_C
-  return _mm256_cvtph_ps(a.x);
-#else
-  EIGEN_ALIGN32 Eigen::half aux[8];
-  pstore(aux, a);
-  float f0(aux[0]);
-  float f1(aux[1]);
-  float f2(aux[2]);
-  float f3(aux[3]);
-  float f4(aux[4]);
-  float f5(aux[5]);
-  float f6(aux[6]);
-  float f7(aux[7]);
-
-  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
-#endif
-}
-
-EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
-#ifdef EIGEN_HAS_FP16_C
-  Packet8h result;
-  result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
-  return result;
-#else
-  EIGEN_ALIGN32 float aux[8];
-  pstore(aux, a);
-  Eigen::half h0(aux[0]);
-  Eigen::half h1(aux[1]);
-  Eigen::half h2(aux[2]);
-  Eigen::half h3(aux[3]);
-  Eigen::half h4(aux[4]);
-  Eigen::half h5(aux[5]);
-  Eigen::half h6(aux[6]);
-  Eigen::half h7(aux[7]);
-
-  Packet8h result;
-  result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
-  return result;
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  Packet8f af = half2float(a);
-  Packet8f bf = half2float(b);
-  Packet8f rf = padd(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  Packet8f af = half2float(a);
-  Packet8f bf = half2float(b);
-  Packet8f rf = pmul(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
-{
-  Packet8h result;
-  result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
-{
-  EIGEN_ALIGN32 Eigen::half aux[8];
-  pstore(aux, from);
-  to[stride*0].x = aux[0].x;
-  to[stride*1].x = aux[1].x;
-  to[stride*2].x = aux[2].x;
-  to[stride*3].x = aux[3].x;
-  to[stride*4].x = aux[4].x;
-  to[stride*5].x = aux[5].x;
-  to[stride*6].x = aux[6].x;
-  to[stride*7].x = aux[7].x;
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_max<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_min<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_mul<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8h,8>& kernel) {
-  __m128i a = kernel.packet[0].x;
-  __m128i b = kernel.packet[1].x;
-  __m128i c = kernel.packet[2].x;
-  __m128i d = kernel.packet[3].x;
-  __m128i e = kernel.packet[4].x;
-  __m128i f = kernel.packet[5].x;
-  __m128i g = kernel.packet[6].x;
-  __m128i h = kernel.packet[7].x;
-
-  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
-  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
-  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
-  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
-  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
-  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
-  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
-  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
-
-  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
-  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
-  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
-  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
-  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
-  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
-  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
-  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
-
-  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
-  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
-  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
-  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
-  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
-  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
-  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
-  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
-
-  kernel.packet[0].x = a0b0c0d0e0f0g0h0;
-  kernel.packet[1].x = a1b1c1d1e1f1g1h1;
-  kernel.packet[2].x = a2b2c2d2e2f2g2h2;
-  kernel.packet[3].x = a3b3c3d3e3f3g3h3;
-  kernel.packet[4].x = a4b4c4d4e4f4g4h4;
-  kernel.packet[5].x = a5b5c5d5e5f5g5h5;
-  kernel.packet[6].x = a6b6c6d6e6f6g6h6;
-  kernel.packet[7].x = a7b7c7d7e7f7g7h7;
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8h,4>& kernel) {
-  EIGEN_ALIGN32 Eigen::half in[4][8];
-  pstore<Eigen::half>(in[0], kernel.packet[0]);
-  pstore<Eigen::half>(in[1], kernel.packet[1]);
-  pstore<Eigen::half>(in[2], kernel.packet[2]);
-  pstore<Eigen::half>(in[3], kernel.packet[3]);
-
-  EIGEN_ALIGN32 Eigen::half out[4][8];
-
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      out[i][j] = in[j][2*i];
-    }
-    for (int j = 0; j < 4; ++j) {
-      out[i][j+4] = in[j][2*i+1];
-    }
-  }
-
-  kernel.packet[0] = pload<Packet8h>(out[0]);
-  kernel.packet[1] = pload<Packet8h>(out[1]);
-  kernel.packet[2] = pload<Packet8h>(out[2]);
-  kernel.packet[3] = pload<Packet8h>(out[3]);
-}
-
-
-// Disable the following code since it's broken on too many platforms / compilers.
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
-#elif 0
-
-typedef struct {
-  __m64 x;
-} Packet4h;
-
-
-template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
-
-template <>
-struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet4h type;
-  // There is no half-size packet for Packet4h.
-  typedef Packet4h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
-    HasHalfPacket = 0,
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
-    HasNegate = 0,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
-    HasSetLinear = 0,
-    HasDiv = 0,
-    HasSqrt = 0,
-    HasRsqrt = 0,
-    HasExp = 0,
-    HasLog = 0,
-    HasBlend = 0
-  };
-};
-
-
-template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
-
-template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
-  Packet4h result;
-  result.x = _mm_set1_pi16(from.x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
-  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha + hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha * hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
-  Packet4h result;
-  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
-  Packet4h result;
-  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
-  __int64_t r = _mm_cvtm64_si64(from.x);
-  *(reinterpret_cast<__int64_t*>(to)) = r;
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
-  __int64_t r = _mm_cvtm64_si64(from.x);
-  *(reinterpret_cast<__int64_t*>(to)) = r;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h
-ploadquad<Packet4h>(const Eigen::half* from) {
-  return pset1<Packet4h>(*from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
-{
-  Packet4h result;
-  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
-{
-  __int64_t a = _mm_cvtm64_si64(from.x);
-  to[stride*0].x = static_cast<unsigned short>(a);
-  to[stride*1].x = static_cast<unsigned short>(a >> 16);
-  to[stride*2].x = static_cast<unsigned short>(a >> 32);
-  to[stride*3].x = static_cast<unsigned short>(a >> 48);
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet4h,4>& kernel) {
-  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
-  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
-  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
-  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
-
-  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
-  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
-  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
-  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
-}
-
-#endif
-
-}
-}
-
-#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
diff --git a/contrib/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h b/contrib/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h
deleted file mode 100644
index aa5fbce8eac475a752a5b096133a01940b49f4a8..0000000000000000000000000000000000000000
--- a/contrib/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ /dev/null
@@ -1,212 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TYPE_CASTING_CUDA_H
-#define EIGEN_TYPE_CASTING_CUDA_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<>
-struct scalar_cast_op<float, Eigen::half> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef Eigen::half result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
-    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-      return __float2half(a);
-    #else
-      return Eigen::half(a);
-    #endif
-  }
-};
-
-template<>
-struct functor_traits<scalar_cast_op<float, Eigen::half> >
-{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
-
-
-template<>
-struct scalar_cast_op<int, Eigen::half> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef Eigen::half result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
-    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-      return __float2half(static_cast<float>(a));
-    #else
-      return Eigen::half(static_cast<float>(a));
-    #endif
-  }
-};
-
-template<>
-struct functor_traits<scalar_cast_op<int, Eigen::half> >
-{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
-
-
-template<>
-struct scalar_cast_op<Eigen::half, float> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef float result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
-    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-      return __half2float(a);
-    #else
-      return static_cast<float>(a);
-    #endif
-  }
-};
-
-template<>
-struct functor_traits<scalar_cast_op<Eigen::half, float> >
-{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
-
-
-
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-
-template <>
-struct type_casting_traits<Eigen::half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 2,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
-  float2 r1 = __half22float2(a);
-  float2 r2 = __half22float2(b);
-  return make_float4(r1.x, r1.y, r2.x, r2.y);
-}
-
-template <>
-struct type_casting_traits<float, Eigen::half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 2
-  };
-};
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
-  // Simply discard the second half of the input
-  return __floats2half2_rn(a.x, a.y);
-}
-
-#elif defined EIGEN_VECTORIZE_AVX512
-template <>
-struct type_casting_traits<half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
-  return half2float(a);
-}
-
-template <>
-struct type_casting_traits<float, half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
-  return float2half(a);
-}
-
-#elif defined EIGEN_VECTORIZE_AVX
-
-template <>
-struct type_casting_traits<Eigen::half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
-  return half2float(a);
-}
-
-template <>
-struct type_casting_traits<float, Eigen::half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
-  return float2half(a);
-}
-
-// Disable the following code since it's broken on too many platforms / compilers.
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
-#elif 0
-
-template <>
-struct type_casting_traits<Eigen::half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
-  float f1 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  float f2 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  float f3 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  float f4 = static_cast<float>(h);
-  return _mm_set_ps(f4, f3, f2, f1);
-}
-
-template <>
-struct type_casting_traits<float, Eigen::half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
-  EIGEN_ALIGN16 float aux[4];
-  pstore(aux, a);
-  Eigen::half h0(aux[0]);
-  Eigen::half h1(aux[1]);
-  Eigen::half h2(aux[2]);
-  Eigen::half h3(aux[3]);
-
-  Packet4h result;
-  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
-  return result;
-}
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TYPE_CASTING_CUDA_H
diff --git a/contrib/eigen/Eigen/src/Core/arch/Default/ConjHelper.h b/contrib/eigen/Eigen/src/Core/arch/Default/ConjHelper.h
index 4cfe34e05268c459645c00449b0a92b23f1d0c2b..53830b5a274c72546315eaea5d1fb4fb737e3a8b 100644
--- a/contrib/eigen/Eigen/src/Core/arch/Default/ConjHelper.h
+++ b/contrib/eigen/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -11,19 +11,107 @@
 #ifndef EIGEN_ARCH_CONJ_HELPER_H
 #define EIGEN_ARCH_CONJ_HELPER_H
 
-#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)                                                          \
-  template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> {                                          \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
-    { return padd(c, pmul(x,y)); }                                                                                \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const                        \
-    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); }                                           \
-  };                                                                                                              \
-                                                                                                                  \
-  template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> {                                          \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
-    { return padd(c, pmul(x,y)); }                                                                                \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const                        \
-    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); }                                           \
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)      \
+  template <>                                                           \
+  struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> {          \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x,         \
+                                          const PACKET_CPLX& y,         \
+                                          const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                 \
+    }                                                                   \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x,          \
+                                         const PACKET_CPLX& y) const {  \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v));   \
+    }                                                                   \
+  };                                                                    \
+                                                                        \
+  template <>                                                           \
+  struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> {          \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x,         \
+                                          const PACKET_REAL& y,         \
+                                          const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                 \
+    }                                                                   \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x,          \
+                                         const PACKET_REAL& y) const {  \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y));   \
+    }                                                                   \
   };
 
-#endif // EIGEN_ARCH_CONJ_HELPER_H
+namespace Eigen {
+namespace internal {
+
+template<bool Conjugate> struct conj_if;
+
+template<> struct conj_if<true> {
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); }
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); }
+};
+
+template<> struct conj_if<false> {
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; }
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; }
+};
+
+// Generic Implementation, assume scalars since the packet-version is
+// specialized below.
+template<typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
+struct conj_helper {
+  typedef typename ScalarBinaryOpTraits<LhsType, RhsType>::ReturnType ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const
+  { return this->pmul(x, y) + c; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmul(const LhsType& x, const RhsType& y) const
+  { return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y); }
+};
+
+template<typename LhsScalar, typename RhsScalar>
+struct conj_helper<LhsScalar, RhsScalar, true, true> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const
+  { return this->pmul(x, y) + c; }
+
+  // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
+  pmul(const LhsScalar& x, const RhsScalar& y) const
+  { return numext::conj(x * y); }
+};
+
+// Implementation with equal type, use packet operations.
+template<typename Packet, bool ConjLhs, bool ConjRhs>
+struct conj_helper<Packet, Packet, ConjLhs, ConjRhs>
+{
+  typedef Packet ResultType;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
+  { return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c); }
+
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
+  { return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y)); }
+};
+
+template<typename Packet>
+struct conj_helper<Packet, Packet, true, true>
+{
+  typedef Packet ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
+  { return Eigen::internal::pmadd(pconj(x), pconj(y), c); }
+  // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
+  { return pconj(Eigen::internal::pmul(x, y)); }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_ARCH_CONJ_HELPER_H
diff --git a/contrib/eigen/Eigen/src/Core/arch/Default/Settings.h b/contrib/eigen/Eigen/src/Core/arch/Default/Settings.h
index 097373c84dc0ba4ecc4972c95b8976b58f8e5c2a..a5c3ada4c7fc54fd250a020c7d4fe3487cd1b1c5 100644
--- a/contrib/eigen/Eigen/src/Core/arch/Default/Settings.h
+++ b/contrib/eigen/Eigen/src/Core/arch/Default/Settings.h
@@ -21,7 +21,7 @@
   * it does not correspond to the number of iterations or the number of instructions
   */
 #ifndef EIGEN_UNROLLING_LIMIT
-#define EIGEN_UNROLLING_LIMIT 100
+#define EIGEN_UNROLLING_LIMIT 110
 #endif
 
 /** Defines the threshold between a "small" and a "large" matrix.
diff --git a/contrib/eigen/Eigen/src/Core/arch/NEON/Complex.h b/contrib/eigen/Eigen/src/Core/arch/NEON/Complex.h
index 306a309beb2f5e319aadfacc2a42e949a5446740..f40af7f87f2f77217775309568030c5ddf7facfc 100644
--- a/contrib/eigen/Eigen/src/Core/arch/NEON/Complex.h
+++ b/contrib/eigen/Eigen/src/Core/arch/NEON/Complex.h
@@ -15,9 +15,10 @@ namespace Eigen {
 
 namespace internal {
 
-inline uint32x4_t p4ui_CONJ_XOR() {
+inline uint32x4_t p4ui_CONJ_XOR()
+{
 // See bug 1325, clang fails to call vld1q_u64.
-#if EIGEN_COMP_CLANG
+#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
   uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
   return ret;
 #else
@@ -26,61 +27,136 @@ inline uint32x4_t p4ui_CONJ_XOR() {
 #endif
 }
 
-inline uint32x2_t p2ui_CONJ_XOR() {
+inline uint32x2_t p2ui_CONJ_XOR()
+{
   static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
   return vld1_u32( conj_XOR_DATA );
 }
 
 //---------- float ----------
+
+struct Packet1cf
+{
+  EIGEN_STRONG_INLINE Packet1cf() {}
+  EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {}
+  Packet2f v;
+};
 struct Packet2cf
 {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
-  Packet4f  v;
+  Packet4f v;
 };
 
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
 {
   typedef Packet2cf type;
-  typedef Packet2cf half;
-  enum {
+  typedef Packet1cf half;
+  enum
+  {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
-
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasMul       = 1,
+    HasDiv       = 1,
+    HasNegate    = 1,
+    HasAbs       = 0,
+    HasAbs2      = 0,
+    HasMin       = 0,
+    HasMax       = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
-
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+template<> struct unpacket_traits<Packet1cf>
 {
-  float32x2_t r64;
-  r64 = vld1_f32((const float *)&from);
+  typedef std::complex<float> type;
+  typedef Packet1cf half;
+  typedef Packet2f as_real;
+  enum
+  {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2cf>
+{
+  typedef std::complex<float> type;
+  typedef Packet1cf half;
+  typedef Packet4f as_real;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cf pcast<float,Packet1cf>(const float& a)
+{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f,Packet2cf>(const Packet2f& a)
+{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); }
 
+template<> EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from)
+{ return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+  const float32x2_t r64 = vld1_f32(reinterpret_cast<const float*>(&from));
   return Packet2cf(vcombine_f32(r64, r64));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(padd<Packet2f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(psub<Packet2f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
+{
+  const Packet2ui b = vreinterpret_u32_f32(a.v);
+  return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
+}
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
-  Packet4ui b = vreinterpretq_u32_f32(a.v);
+  const Packet4ui b = vreinterpretq_u32_f32(a.v);
   return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{
+  Packet2f v1, v2;
+
+  // Get the real values of a | a1_re | a1_re |
+  v1 = vdup_lane_f32(a.v, 0);
+  // Get the imag values of a | a1_im | a1_im |
+  v2 = vdup_lane_f32(a.v, 1);
+  // Multiply the real a with b
+  v1 = vmul_f32(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmul_f32(v2, b.v);
+  // Conjugate v2
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
+  // Swap real/imag elements in v2.
+  v2 = vrev64_f32(v2);
+  // Add and return the result
+  return Packet1cf(vadd_f32(v1, v2));
+}
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   Packet4f v1, v2;
@@ -93,7 +169,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   v1 = vmulq_f32(v1, b.v);
   // Multiply the imag a with b
   v2 = vmulq_f32(v2, b.v);
-  // Conjugate v2 
+  // Conjugate v2
   v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64q_f32(v2);
@@ -101,98 +177,144 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   return Packet2cf(vaddq_f32(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b)
 {
-  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a[0])==re(b[0]), im(a[0])==im(b[0])]
+  Packet2f eq = pcmp_eq<Packet2f>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a[0])==im(b[0]), re(a[0])==re(b[0])]
+  Packet2f eq_swapped = vrev64_f32(eq);
+  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cf(pand<Packet2f>(eq, eq_swapped));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
+  Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])]
+  Packet4f eq_swapped = vrev64q_f32(eq);
+  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet2cf(pand<Packet4f>(eq, eq_swapped));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+template<> EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+template<> EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
+{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from)
+{ return pset1<Packet1cf>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
+{ return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(
+    const std::complex<float>* from, Index stride)
+{
+  const Packet2f tmp = vdup_n_f32(std::real(from[0*stride]));
+  return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
+    const std::complex<float>* from, Index stride)
 {
-  Packet4f res = pset1<Packet4f>(0.f);
-  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
+  Packet4f res = vdupq_n_f32(std::real(from[0*stride]));
   res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
   res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
   res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
   return Packet2cf(res);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(
+    std::complex<float>* to, const Packet1cf& from, Index stride)
+{ to[stride*0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); }
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(
+    std::complex<float>* to, const Packet2cf& from, Index stride)
 {
   to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
   to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *addr)
+{ EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr)); }
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 x[2];
-  vst1q_f32((float *)x, a.v);
+  EIGEN_ALIGN16 std::complex<float> x;
+  vst1_f32(reinterpret_cast<float*>(&x), a.v);
+  return x;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
+{
+  EIGEN_ALIGN16 std::complex<float> x[2];
+  vst1q_f32(reinterpret_cast<float*>(x), a.v);
   return x[0];
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
-  float32x2_t a_lo, a_hi;
-  Packet4f a_r128;
-
-  a_lo = vget_low_f32(a.v);
-  a_hi = vget_high_f32(a.v);
-  a_r128 = vcombine_f32(a_hi, a_lo);
-
-  return Packet2cf(a_r128);
-}
+{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); }
 
+template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a)
+{ return Packet1cf(vrev64_f32(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
+{ return Packet2cf(vrev64q_f32(a.v)); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a)
 {
-  return Packet2cf(vrev64q_f32(a.v));
+  std::complex<float> s;
+  vst1_f32((float *)&s, a.v);
+  return s;
 }
-
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
-  float32x2_t a1, a2;
   std::complex<float> s;
-
-  a1 = vget_low_f32(a.v);
-  a2 = vget_high_f32(a.v);
-  a2 = vadd_f32(a1, a2);
-  vst1_f32((float *)&s, a2);
-
+  vst1_f32(reinterpret_cast<float*>(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v)));
   return s;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a)
 {
-  Packet4f sum1, sum2, sum;
-
-  // Add the first two 64-bit float32x2_t of vecs[0]
-  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
-  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
-  sum = vaddq_f32(sum1, sum2);
-
-  return Packet2cf(sum);
+  std::complex<float> s;
+  vst1_f32((float *)&s, a.v);
+  return s;
 }
-
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   float32x2_t a1, a2, v1, v2, prod;
@@ -208,90 +330,67 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
   v1 = vmul_f32(v1, a2);
   // Multiply the imag a with b
   v2 = vmul_f32(v2, a2);
-  // Conjugate v2 
+  // Conjugate v2
   v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64_f32(v2);
   // Add v1, v2
   prod = vadd_f32(v1, v2);
 
-  vst1_f32((float *)&s, prod);
+  vst1_f32(reinterpret_cast<float*>(&s), prod);
 
   return s;
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
-  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-      first.v = vextq_f32(first.v, second.v, 2);
-    }
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+template<> EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
 {
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
+  // TODO optimize it for NEON
+  Packet1cf res = pmul(a, pconj(b));
+  Packet2f s, rev_s;
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+  // this computes the norm
+  s = vmul_f32(b.v, b.v);
+  rev_s = vrev64_f32(s);
 
+  return Packet1cf(pdiv<Packet2f>(res.v, vadd_f32(s, rev_s)));
+}
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for NEON
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet2cf res = pmul(a,pconj(b));
   Packet4f s, rev_s;
 
   // this computes the norm
   s = vmulq_f32(b.v, b.v);
   rev_s = vrev64q_f32(s);
 
-  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
+  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s, rev_s)));
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel)
+{
   Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
   kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
   kernel.packet[1].v = tmp;
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
+  return psqrt_complex<Packet1cf>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
 //---------- double ----------
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
 // See bug 1325, clang fails to call vld1q_u64.
-#if EIGEN_COMP_CLANG
+#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
   static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
 #else
   const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
@@ -309,7 +408,8 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet1cd type;
   typedef Packet1cd half;
-  enum {
+  enum
+  {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
@@ -328,24 +428,50 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd>
+{
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum
+  {
+    size=1,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
+{
+  /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(padd<Packet2d>(a.v, b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(psub<Packet2d>(a.v, b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
+{ return Packet1cd(pnegate<Packet2d>(a.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
+{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   Packet2d v1, v2;
 
-  // Get the real values of a 
+  // Get the real values of a
   v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
   // Get the imag values of a
   v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
@@ -353,7 +479,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   v1 = vmulq_f64(v1, b.v);
   // Multiply the imag a with b
   v2 = vmulq_f64(v2, b.v);
-  // Conjugate v2 
+  // Conjugate v2
   v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
   // Swap real/imag elements in v2.
   v2 = preverse<Packet2d>(v2);
@@ -361,31 +487,44 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   return Packet1cd(vaddq_f64(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
 {
-  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a)==re(b), im(a)==im(b)]
+  Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq)));
+  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cd(pand<Packet2d>(eq, eq_swapped));
 }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
 template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
+{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
+{ return pset1<Packet1cd>(*from); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
+{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v); }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v); }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *addr)
+{ EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr)); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
+    const std::complex<double>* from, Index stride)
 {
   Packet2d res = pset1<Packet2d>(0.0);
   res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
@@ -393,17 +532,14 @@ template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Pack
   return Packet1cd(res);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
-{
-  to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
-}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(
+    std::complex<double>* to, const Packet1cd& from, Index stride)
+{ to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); }
 
-
-template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res;
+  EIGEN_ALIGN16 std::complex<double> res;
   pstore<std::complex<double> >(&res, a);
-
   return res;
 }
 
@@ -411,59 +547,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a
 
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }
-
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
-  {
-    // FIXME is it sure we never have to align a Packet1cd?
-    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   // TODO optimize it for NEON
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet1cd res = pmul(a,pconj(b));
   Packet2d s = pmul<Packet2d>(b.v, b.v);
   Packet2d rev_s = preverse<Packet2d>(s);
 
@@ -471,9 +562,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
 }
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
-  return Packet1cd(preverse(Packet2d(x.v)));
-}
+{ return Packet1cd(preverse(Packet2d(x.v))); }
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
 {
@@ -481,6 +570,11 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
   kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
   kernel.packet[1].v = tmp;
 }
+
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
 #endif // EIGEN_ARCH_ARM64
 
 } // end namespace internal
diff --git a/contrib/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h b/contrib/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h
index 6bb05bb922aafac46b5eb043d66f1c81ab77834b..fa6615a85194f72887a41867f5e586fb8c2a62de 100644
--- a/contrib/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/contrib/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -5,10 +5,6 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
 #ifndef EIGEN_MATH_FUNCTIONS_NEON_H
 #define EIGEN_MATH_FUNCTIONS_NEON_H
 
@@ -16,74 +12,62 @@ namespace Eigen {
 
 namespace internal {
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  Packet4f tmp, fx;
-
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-  x = vminq_f32(x, p4f_exp_hi);
-  x = vmaxq_f32(x, p4f_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
-
-  /* perform a floorf */
-  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
-
-  /* if greater, substract 1 */
-  Packet4ui mask = vcgtq_f32(tmp, fx);
-  mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
-
-  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
-
-  tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
-  Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
-  x = vsubq_f32(x, tmp);
-  x = vsubq_f32(x, z);
-
-  Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
-  z = vmulq_f32(x, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p1);
-  y = vmulq_f32(y, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p2);
-  y = vmulq_f32(y, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p3);
-  y = vmulq_f32(y, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p4);
-  y = vmulq_f32(y, x);
-  y = vaddq_f32(y, p4f_cephes_exp_p5);
-
-  y = vmulq_f32(y, z);
-  y = vaddq_f32(y, x);
-  y = vaddq_f32(y, p4f_1);
-
-  /* build 2^n */
-  int32x4_t mm;
-  mm = vcvtq_s32_f32(fx);
-  mm = vaddq_s32(mm, p4i_0x7f);
-  mm = vshlq_n_s32(mm, 23);
-  Packet4f pow2n = vreinterpretq_f32_s32(mm);
-
-  y = vmulq_f32(y, pow2n);
-  return y;
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp<Packet2f>(const Packet2f& x)
+{ return pexp_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp<Packet4f>(const Packet4f& x)
+{ return pexp_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog<Packet2f>(const Packet2f& x)
+{ return plog_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog<Packet4f>(const Packet4f& x)
+{ return plog_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin<Packet2f>(const Packet2f& x)
+{ return psin_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin<Packet4f>(const Packet4f& x)
+{ return psin_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos<Packet2f>(const Packet2f& x)
+{ return pcos_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos<Packet4f>(const Packet4f& x)
+{ return pcos_float(x); }
+
+// Hyperbolic Tangent function.
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh<Packet2f>(const Packet2f& x)
+{ return internal::generic_fast_tanh_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh<Packet4f>(const Packet4f& x)
+{ return internal::generic_fast_tanh_float(x); }
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) {
+  Packet4f fexponent;
+  const Packet4bf out = F32ToBf16(pfrexp<Packet4f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& exponent) {
+  return F32ToBf16(pldexp<Packet4f>(Bf16ToF32(a), Bf16ToF32(exponent)));
 }
 
+//---------- double ----------
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp<Packet2d>(const Packet2d& x)
+{ return pexp_double(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d plog<Packet2d>(const Packet2d& x)
+{ return plog_double(x); }
+
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/arch/NEON/PacketMath.h b/contrib/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
index 3d5ed0d240c15cde1ef9f1a600353710816d40e4..d2aeef4308b60def3670274e81c1380ccb52b91c 100644
--- a/contrib/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/contrib/eigen/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -24,54 +24,118 @@ namespace internal {
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
 #if EIGEN_ARCH_ARM64
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #else
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
 #endif
 #endif
 
-#if EIGEN_COMP_MSVC
+#if EIGEN_COMP_MSVC_STRICT
 
 // In MSVC's arm_neon.h header file, all NEON vector types
 // are aliases to the same underlying type __n128.
 // We thus have to wrap them to make them different C++ types.
 // (See also bug 1428)
+typedef eigen_packet_wrapper<float32x2_t,0>  Packet2f;
+typedef eigen_packet_wrapper<float32x4_t,1>  Packet4f;
+typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
+typedef eigen_packet_wrapper<int8x8_t   ,3>  Packet8c;
+typedef eigen_packet_wrapper<int8x16_t  ,4>  Packet16c;
+typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
+typedef eigen_packet_wrapper<uint8x8_t  ,6>  Packet8uc;
+typedef eigen_packet_wrapper<uint8x16_t ,7>  Packet16uc;
+typedef eigen_packet_wrapper<int16x4_t  ,8>  Packet4s;
+typedef eigen_packet_wrapper<int16x8_t  ,9>  Packet8s;
+typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
+typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
+typedef eigen_packet_wrapper<int32x2_t  ,12> Packet2i;
+typedef eigen_packet_wrapper<int32x4_t  ,13> Packet4i;
+typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
+typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
+typedef eigen_packet_wrapper<int64x2_t  ,16> Packet2l;
+typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
 
-template<typename T,int unique_id>
-struct eigen_packet_wrapper
-{
-  operator T&() { return m_val; }
-  operator const T&() const { return m_val; }
-  eigen_packet_wrapper() {}
-  eigen_packet_wrapper(const T &v) : m_val(v) {}
-  eigen_packet_wrapper& operator=(const T &v) {
-    m_val = v;
-    return *this;
-  }
+#else
 
-  T m_val;
-};
-typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
-typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
-typedef eigen_packet_wrapper<int32x4_t  ,2> Packet4i;
-typedef eigen_packet_wrapper<int32x2_t  ,3> Packet2i;
-typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
+typedef float32x2_t                          Packet2f;
+typedef float32x4_t                          Packet4f;
+typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
+typedef int8x8_t                             Packet8c;
+typedef int8x16_t                            Packet16c;
+typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
+typedef uint8x8_t                            Packet8uc;
+typedef uint8x16_t                           Packet16uc;
+typedef int16x4_t                            Packet4s;
+typedef int16x8_t                            Packet8s;
+typedef uint16x4_t                           Packet4us;
+typedef uint16x8_t                           Packet8us;
+typedef int32x2_t                            Packet2i;
+typedef int32x4_t                            Packet4i;
+typedef uint32x2_t                           Packet2ui;
+typedef uint32x4_t                           Packet4ui;
+typedef int64x2_t                            Packet2l;
+typedef uint64x2_t                           Packet2ul;
+
+#endif // EIGEN_COMP_MSVC_STRICT
+
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
+  const float* a = reinterpret_cast<const float*>(&m);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
+  return res;
+}
 
-#else
+// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
+// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
+// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
+// to enable a shared implementation for fast inversion of matrices of size 4. 
+template<bool interleave> 
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+template<> 
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask) 
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
 
-typedef float32x2_t Packet2f;
-typedef float32x4_t Packet4f;
-typedef int32x4_t   Packet4i;
-typedef int32x2_t   Packet2i;
-typedef uint32x4_t  Packet4ui;
+EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
 
-#endif // EIGEN_COMP_MSVC
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
+{ 
+  return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
+{ 
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+}
+#define vec4f_duplane(a, p) \
+  vdupq_lane_f32(vget_low_f32(a), p)
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@@ -98,660 +162,4423 @@ typedef uint32x4_t  Packet4ui;
   #define EIGEN_ARM_PREFETCH(ADDR)
 #endif
 
-template<> struct packet_traits<float>  : default_packet_traits
-{
-  typedef Packet4f type;
-  typedef Packet4f half; // Packet2f intrinsics not implemented yet
+template <>
+struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet4f type;
+  typedef Packet2f half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasDiv   = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = EIGEN_FAST_MATH,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf  = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template <>
+struct packet_traits<int8_t> : default_packet_traits
+{
+  typedef Packet16c type;
+  typedef Packet8c half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint8_t> : default_packet_traits
+{
+  typedef Packet16uc type;
+  typedef Packet8uc half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int16_t> : default_packet_traits
+{
+  typedef Packet8s type;
+  typedef Packet4s half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint16_t> : default_packet_traits
+{
+  typedef Packet8us type;
+  typedef Packet4us half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasAbsDiff   = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet2i half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint32_t> : default_packet_traits
+{
+  typedef Packet4ui type;
+  typedef Packet2ui half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int64_t> : default_packet_traits
+{
+  typedef Packet2l type;
+  typedef Packet2l half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+template <>
+struct packet_traits<uint64_t> : default_packet_traits
+{
+  typedef Packet2ul type;
+  typedef Packet2ul half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 0,
+    HasAbs       = 0,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0
+  };
+};
+
+#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM
+// workaround gcc 4.2, 4.3 and 4.4 compilation issue
+EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
+EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
+#endif
+
+template<> struct unpacket_traits<Packet2f>
+{
+  typedef float type;
+  typedef Packet2f half;
+  typedef Packet2i integer_packet;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4f>
+{
+  typedef float type;
+  typedef Packet2f half;
+  typedef Packet4i integer_packet;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4c>
+{
+  typedef int8_t type;
+  typedef Packet4c half;
+  enum
+  {
+    size = 4,
+    alignment = Unaligned,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8c>
+{
+  typedef int8_t type;
+  typedef Packet4c half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet16c>
+{
+  typedef int8_t type;
+  typedef Packet8c half;
+  enum
+  {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4uc>
+{
+  typedef uint8_t type;
+  typedef Packet4uc half;
+  enum
+  {
+    size = 4,
+    alignment = Unaligned,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8uc>
+{
+  typedef uint8_t type;
+  typedef Packet4uc half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet16uc>
+{
+  typedef uint8_t type;
+  typedef Packet8uc half;
+  enum
+  {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false};
+};
+template<> struct unpacket_traits<Packet4s>
+{
+  typedef int16_t type;
+  typedef Packet4s half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8s>
+{
+  typedef int16_t type;
+  typedef Packet4s half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4us>
+{
+  typedef uint16_t type;
+  typedef Packet4us half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet8us>
+{
+  typedef uint16_t type;
+  typedef Packet4us half;
+  enum
+  {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2i>
+{
+  typedef int32_t type;
+  typedef Packet2i half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4i>
+{
+  typedef int32_t type;
+  typedef Packet2i half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2ui>
+{
+  typedef uint32_t type;
+  typedef Packet2ui half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet4ui>
+{
+  typedef uint32_t type;
+  typedef Packet2ui half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2l>
+{
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template<> struct unpacket_traits<Packet2ul>
+{
+  typedef uint64_t type;
+  typedef Packet2ul half;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
+{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
+{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(unsigned int from)
+{ return vreinterpret_f32_u32(vdup_n_u32(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from)
+{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
+{
+  const float c[] = {0.0f,1.0f};
+  return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
+{
+  const float c[] = {0.0f,1.0f,2.0f,3.0f};
+  return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
+{
+  const int8_t c[] = {0,1,2,3,4,5,6,7};
+  return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
+{
+  const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
+{
+  const uint8_t c[] = {0,1,2,3,4,5,6,7};
+  return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
+{
+  const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
+{
+  const int16_t c[] = {0,1,2,3};
+  return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
+{
+  const uint16_t c[] = {0,1,2,3};
+  return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
+{
+  const int16_t c[] = {0,1,2,3,4,5,6,7};
+  return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
+{
+  const uint16_t c[] = {0,1,2,3,4,5,6,7};
+  return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
+{
+  const int32_t c[] = {0,1};
+  return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
+{
+  const int32_t c[] = {0,1,2,3};
+  return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
+{
+  const uint32_t c[] = {0,1};
+  return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
+{
+  const uint32_t c[] = {0,1,2,3};
+  return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
+{
+  const int64_t c[] = {0,1};
+  return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
+{
+  const uint64_t c[] = {0,1};
+  return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
+template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
+  Packet2f mask = {numext::bit_cast<float>(0x80000000u), 0.0f};
+  return padd(a, pxor(mask, b));
+}
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f mask = {numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f};
+  return padd(a, pxor(mask, b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vnegq_s64(a);
+#else
+  return vcombine_s64(
+      vdup_n_s64(-vgetq_lane_s64(a, 0)),
+      vdup_n_s64(-vgetq_lane_s64(a, 1)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+    vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
+    vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+    vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
+    vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vdiv_f32(a,b);
+#else
+  Packet2f inv, restep, div;
+
+  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
+  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
+  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
+  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
+  // Newton-Raphson and vrecpsq_f32()
+  inv = vrecpe_f32(b);
+
+  // This returns a differential, by which we will have to multiply inv to get a better
+  // approximation of 1/b.
+  restep = vrecps_f32(b, inv);
+  inv = vmul_f32(restep, inv);
+
+  // Finally, multiply a by 1/b and get the wanted result of the division.
+  div = vmul_f32(a, inv);
+
+  return div;
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vdivq_f32(a,b);
+#else
+  Packet4f inv, restep, div;
+
+  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
+  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
+  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
+  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
+  // Newton-Raphson and vrecpsq_f32()
+  inv = vrecpeq_f32(b);
+
+  // This returns a differential, by which we will have to multiply inv to get a better
+  // approximation of 1/b.
+  restep = vrecpsq_f32(b, inv);
+  inv = vmulq_f32(restep, inv);
+
+  // Finally, multiply a by 1/b and get the wanted result of the division.
+  div = vmulq_f32(a, inv);
+
+  return div;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4s>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8s>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4us>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8us>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2i>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4i>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ui>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4ui>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2l>(0LL);
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ul>(0ULL);
+}
+
+
+#ifdef __ARM_FEATURE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{ return vfmaq_f32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
+{ return vfma_f32(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  return vmlaq_f32(c,a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
+{
+  return vmla_f32(c,a,b);
+}
+#endif
+
+// No FMA instruction for int, so use MLA unconditionally.
+template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
+      vreinterpret_s8_s32(vdup_n_s32(c)),
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
+{ return vmla_s8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
+{ return vmlaq_s8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
+      vreinterpret_u8_u32(vdup_n_u32(c)),
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
+{ return vmla_u8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
+{ return vmlaq_u8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
+{ return vmla_s16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
+{ return vmlaq_s16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
+{ return vmla_u16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
+{ return vmlaq_u16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
+{ return vmla_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
+{ return vmlaq_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
+{ return vmla_u32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
+{ return vmlaq_u32(c,a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vabd_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vabdq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vabd_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vabdq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vabd_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vabdq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vabd_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vabdq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vabd_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vabdq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vabd_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vabdq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vabd_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vabdq_u32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+      vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+      vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+      vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+      vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+      vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+      vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+      vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+      vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vcle_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vcle_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vcle_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vcleq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vcle_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vcle_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vcleq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vcle_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vcle_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vcleq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vcleq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vcleq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vclt_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vclt_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vclt_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vcltq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vclt_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vclt_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vcltq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vclt_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vclt_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vcltq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vcltq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vcltq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vceq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vreinterpret_s8_u8(vceq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vceq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vceqq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vreinterpret_s16_u16(vceq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vceq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vceqq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vreinterpret_s32_u32(vceq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vceq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vceqq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vceqq_s64(a,b));
+#else
+  return vcombine_s64(
+      vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+      vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vceqq_u64(a,b);
+#else
+  return vcombine_u64(
+      vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+      vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a & b; }
+template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vand_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vandq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a & b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vand_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vandq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vand_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vandq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vand_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vandq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vandq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a | b; }
+template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vorrq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a | b; }
+template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vorr_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vorrq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vorr_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vorrq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vorr_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vorrq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vorr_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vorrq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return vorrq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vorrq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a ^ b; }
+template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return veor_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return veorq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a ^ b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return veor_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return veorq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return veor_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return veorq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return veor_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return veorq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return veorq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return veorq_u64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
+template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b)
+{ return a & ~b; }
+template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{ return a & ~b; }
+template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vbic_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vbicq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vbic_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vbicq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vbic_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vbicq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vbic_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vbicq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vbic_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vbicq_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b)
+{ return vbicq_s64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
+{ return vbicq_u64(a,b); }
+
+
+template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
+
+template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
+{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
+{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
+{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
+{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
+{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
+{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
+{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
+
+template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
+template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
+{
+  Packet4c res;
+  memcpy(&res, from, sizeof(Packet4c));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
+{
+  Packet4uc res;
+  memcpy(&res, from, sizeof(Packet4uc));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
+{
+  Packet4c res;
+  memcpy(&res, from, sizeof(Packet4c));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
+template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
+{
+  Packet4uc res;
+  memcpy(&res, from, sizeof(Packet4uc));
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
+template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
+template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
+template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
+{ return vld1_dup_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
+{
+  const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
+  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
+{
+  const int8x8_t a = vld1_s8(from);
+  return vzip_s8(a,a).val[0];
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
+{
+  const int8x8_t a = vld1_s8(from);
+  const int8x8x2_t b = vzip_s8(a,a);
+  return vcombine_s8(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
+  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vld1_u8(from);
+  return vzip_u8(a,a).val[0];
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vld1_u8(from);
+  const uint8x8x2_t b = vzip_u8(a,a);
+  return vcombine_u8(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
+{
+  return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
+      vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
+{
+  const int16x4_t a = vld1_s16(from);
+  const int16x4x2_t b = vzip_s16(a,a);
+  return vcombine_s16(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
+{
+  return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
+      vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
+{
+  const uint16x4_t a = vld1_u16(from);
+  const uint16x4x2_t b = vzip_u16(a,a);
+  return vcombine_u16(b.val[0], b.val[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
+{ return vld1_dup_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
+{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
+{ return vld1_dup_u32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
+{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
+{ return vld1q_dup_s64(from); }
+template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
+{ return vld1q_dup_u64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
+{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
+{
+  return vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
+{
+  const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
+  const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
+      vreinterpret_u32_s8(vld1_dup_s8(from+2)),
+      vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
+  return vcombine_s8(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
+{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
+{
+  return vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
+  const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
+      vreinterpret_u32_u8(vld1_dup_u8(from+2)),
+      vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
+  return vcombine_u8(a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
+{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
+{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
+template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from)
+{ memcpy(to, &from, sizeof(from)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
+{
+  Packet2f res = vld1_dup_f32(from);
+  res = vld1_lane_f32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  Packet4f res = vld1q_dup_f32(from);
+  res = vld1q_lane_f32(from + 1*stride, res, 1);
+  res = vld1q_lane_f32(from + 2*stride, res, 2);
+  res = vld1q_lane_f32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
+{
+  Packet4c res;
+  for (int i = 0; i != 4; i++)
+    reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
+{
+  Packet8c res = vld1_dup_s8(from);
+  res = vld1_lane_s8(from + 1*stride, res, 1);
+  res = vld1_lane_s8(from + 2*stride, res, 2);
+  res = vld1_lane_s8(from + 3*stride, res, 3);
+  res = vld1_lane_s8(from + 4*stride, res, 4);
+  res = vld1_lane_s8(from + 5*stride, res, 5);
+  res = vld1_lane_s8(from + 6*stride, res, 6);
+  res = vld1_lane_s8(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
+{
+  Packet16c res = vld1q_dup_s8(from);
+  res = vld1q_lane_s8(from + 1*stride, res, 1);
+  res = vld1q_lane_s8(from + 2*stride, res, 2);
+  res = vld1q_lane_s8(from + 3*stride, res, 3);
+  res = vld1q_lane_s8(from + 4*stride, res, 4);
+  res = vld1q_lane_s8(from + 5*stride, res, 5);
+  res = vld1q_lane_s8(from + 6*stride, res, 6);
+  res = vld1q_lane_s8(from + 7*stride, res, 7);
+  res = vld1q_lane_s8(from + 8*stride, res, 8);
+  res = vld1q_lane_s8(from + 9*stride, res, 9);
+  res = vld1q_lane_s8(from + 10*stride, res, 10);
+  res = vld1q_lane_s8(from + 11*stride, res, 11);
+  res = vld1q_lane_s8(from + 12*stride, res, 12);
+  res = vld1q_lane_s8(from + 13*stride, res, 13);
+  res = vld1q_lane_s8(from + 14*stride, res, 14);
+  res = vld1q_lane_s8(from + 15*stride, res, 15);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
+{
+  Packet4uc res;
+  for (int i = 0; i != 4; i++)
+    reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
+{
+  Packet8uc res = vld1_dup_u8(from);
+  res = vld1_lane_u8(from + 1*stride, res, 1);
+  res = vld1_lane_u8(from + 2*stride, res, 2);
+  res = vld1_lane_u8(from + 3*stride, res, 3);
+  res = vld1_lane_u8(from + 4*stride, res, 4);
+  res = vld1_lane_u8(from + 5*stride, res, 5);
+  res = vld1_lane_u8(from + 6*stride, res, 6);
+  res = vld1_lane_u8(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
+{
+  Packet16uc res = vld1q_dup_u8(from);
+  res = vld1q_lane_u8(from + 1*stride, res, 1);
+  res = vld1q_lane_u8(from + 2*stride, res, 2);
+  res = vld1q_lane_u8(from + 3*stride, res, 3);
+  res = vld1q_lane_u8(from + 4*stride, res, 4);
+  res = vld1q_lane_u8(from + 5*stride, res, 5);
+  res = vld1q_lane_u8(from + 6*stride, res, 6);
+  res = vld1q_lane_u8(from + 7*stride, res, 7);
+  res = vld1q_lane_u8(from + 8*stride, res, 8);
+  res = vld1q_lane_u8(from + 9*stride, res, 9);
+  res = vld1q_lane_u8(from + 10*stride, res, 10);
+  res = vld1q_lane_u8(from + 11*stride, res, 11);
+  res = vld1q_lane_u8(from + 12*stride, res, 12);
+  res = vld1q_lane_u8(from + 13*stride, res, 13);
+  res = vld1q_lane_u8(from + 14*stride, res, 14);
+  res = vld1q_lane_u8(from + 15*stride, res, 15);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
+{
+  Packet4s res = vld1_dup_s16(from);
+  res = vld1_lane_s16(from + 1*stride, res, 1);
+  res = vld1_lane_s16(from + 2*stride, res, 2);
+  res = vld1_lane_s16(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
+{
+  Packet8s res = vld1q_dup_s16(from);
+  res = vld1q_lane_s16(from + 1*stride, res, 1);
+  res = vld1q_lane_s16(from + 2*stride, res, 2);
+  res = vld1q_lane_s16(from + 3*stride, res, 3);
+  res = vld1q_lane_s16(from + 4*stride, res, 4);
+  res = vld1q_lane_s16(from + 5*stride, res, 5);
+  res = vld1q_lane_s16(from + 6*stride, res, 6);
+  res = vld1q_lane_s16(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
+{
+  Packet4us res = vld1_dup_u16(from);
+  res = vld1_lane_u16(from + 1*stride, res, 1);
+  res = vld1_lane_u16(from + 2*stride, res, 2);
+  res = vld1_lane_u16(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
+{
+  Packet8us res = vld1q_dup_u16(from);
+  res = vld1q_lane_u16(from + 1*stride, res, 1);
+  res = vld1q_lane_u16(from + 2*stride, res, 2);
+  res = vld1q_lane_u16(from + 3*stride, res, 3);
+  res = vld1q_lane_u16(from + 4*stride, res, 4);
+  res = vld1q_lane_u16(from + 5*stride, res, 5);
+  res = vld1q_lane_u16(from + 6*stride, res, 6);
+  res = vld1q_lane_u16(from + 7*stride, res, 7);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
+{
+  Packet2i res = vld1_dup_s32(from);
+  res = vld1_lane_s32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
+{
+  Packet4i res = vld1q_dup_s32(from);
+  res = vld1q_lane_s32(from + 1*stride, res, 1);
+  res = vld1q_lane_s32(from + 2*stride, res, 2);
+  res = vld1q_lane_s32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
+{
+  Packet2ui res = vld1_dup_u32(from);
+  res = vld1_lane_u32(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
+{
+  Packet4ui res = vld1q_dup_u32(from);
+  res = vld1q_lane_u32(from + 1*stride, res, 1);
+  res = vld1q_lane_u32(from + 2*stride, res, 2);
+  res = vld1q_lane_u32(from + 3*stride, res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
+{
+  Packet2l res = vld1q_dup_s64(from);
+  res = vld1q_lane_s64(from + 1*stride, res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
+{
+  Packet2ul res = vld1q_dup_u64(from);
+  res = vld1q_lane_u64(from + 1*stride, res, 1);
+  return res;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
+{
+  vst1_lane_f32(to + stride*0, from, 0);
+  vst1_lane_f32(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  vst1q_lane_f32(to + stride*0, from, 0);
+  vst1q_lane_f32(to + stride*1, from, 1);
+  vst1q_lane_f32(to + stride*2, from, 2);
+  vst1q_lane_f32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
+{
+  for (int i = 0; i != 4; i++)
+    *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
+{
+  vst1_lane_s8(to + stride*0, from, 0);
+  vst1_lane_s8(to + stride*1, from, 1);
+  vst1_lane_s8(to + stride*2, from, 2);
+  vst1_lane_s8(to + stride*3, from, 3);
+  vst1_lane_s8(to + stride*4, from, 4);
+  vst1_lane_s8(to + stride*5, from, 5);
+  vst1_lane_s8(to + stride*6, from, 6);
+  vst1_lane_s8(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
+{
+  vst1q_lane_s8(to + stride*0, from, 0);
+  vst1q_lane_s8(to + stride*1, from, 1);
+  vst1q_lane_s8(to + stride*2, from, 2);
+  vst1q_lane_s8(to + stride*3, from, 3);
+  vst1q_lane_s8(to + stride*4, from, 4);
+  vst1q_lane_s8(to + stride*5, from, 5);
+  vst1q_lane_s8(to + stride*6, from, 6);
+  vst1q_lane_s8(to + stride*7, from, 7);
+  vst1q_lane_s8(to + stride*8, from, 8);
+  vst1q_lane_s8(to + stride*9, from, 9);
+  vst1q_lane_s8(to + stride*10, from, 10);
+  vst1q_lane_s8(to + stride*11, from, 11);
+  vst1q_lane_s8(to + stride*12, from, 12);
+  vst1q_lane_s8(to + stride*13, from, 13);
+  vst1q_lane_s8(to + stride*14, from, 14);
+  vst1q_lane_s8(to + stride*15, from, 15);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
+{
+  for (int i = 0; i != 4; i++)
+    *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
+{
+  vst1_lane_u8(to + stride*0, from, 0);
+  vst1_lane_u8(to + stride*1, from, 1);
+  vst1_lane_u8(to + stride*2, from, 2);
+  vst1_lane_u8(to + stride*3, from, 3);
+  vst1_lane_u8(to + stride*4, from, 4);
+  vst1_lane_u8(to + stride*5, from, 5);
+  vst1_lane_u8(to + stride*6, from, 6);
+  vst1_lane_u8(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
+{
+  vst1q_lane_u8(to + stride*0, from, 0);
+  vst1q_lane_u8(to + stride*1, from, 1);
+  vst1q_lane_u8(to + stride*2, from, 2);
+  vst1q_lane_u8(to + stride*3, from, 3);
+  vst1q_lane_u8(to + stride*4, from, 4);
+  vst1q_lane_u8(to + stride*5, from, 5);
+  vst1q_lane_u8(to + stride*6, from, 6);
+  vst1q_lane_u8(to + stride*7, from, 7);
+  vst1q_lane_u8(to + stride*8, from, 8);
+  vst1q_lane_u8(to + stride*9, from, 9);
+  vst1q_lane_u8(to + stride*10, from, 10);
+  vst1q_lane_u8(to + stride*11, from, 11);
+  vst1q_lane_u8(to + stride*12, from, 12);
+  vst1q_lane_u8(to + stride*13, from, 13);
+  vst1q_lane_u8(to + stride*14, from, 14);
+  vst1q_lane_u8(to + stride*15, from, 15);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
+{
+  vst1_lane_s16(to + stride*0, from, 0);
+  vst1_lane_s16(to + stride*1, from, 1);
+  vst1_lane_s16(to + stride*2, from, 2);
+  vst1_lane_s16(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
+{
+  vst1q_lane_s16(to + stride*0, from, 0);
+  vst1q_lane_s16(to + stride*1, from, 1);
+  vst1q_lane_s16(to + stride*2, from, 2);
+  vst1q_lane_s16(to + stride*3, from, 3);
+  vst1q_lane_s16(to + stride*4, from, 4);
+  vst1q_lane_s16(to + stride*5, from, 5);
+  vst1q_lane_s16(to + stride*6, from, 6);
+  vst1q_lane_s16(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
+{
+  vst1_lane_u16(to + stride*0, from, 0);
+  vst1_lane_u16(to + stride*1, from, 1);
+  vst1_lane_u16(to + stride*2, from, 2);
+  vst1_lane_u16(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
+{
+  vst1q_lane_u16(to + stride*0, from, 0);
+  vst1q_lane_u16(to + stride*1, from, 1);
+  vst1q_lane_u16(to + stride*2, from, 2);
+  vst1q_lane_u16(to + stride*3, from, 3);
+  vst1q_lane_u16(to + stride*4, from, 4);
+  vst1q_lane_u16(to + stride*5, from, 5);
+  vst1q_lane_u16(to + stride*6, from, 6);
+  vst1q_lane_u16(to + stride*7, from, 7);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
+{
+  vst1_lane_s32(to + stride*0, from, 0);
+  vst1_lane_s32(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
+{
+  vst1q_lane_s32(to + stride*0, from, 0);
+  vst1q_lane_s32(to + stride*1, from, 1);
+  vst1q_lane_s32(to + stride*2, from, 2);
+  vst1q_lane_s32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
+{
+  vst1_lane_u32(to + stride*0, from, 0);
+  vst1_lane_u32(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
+{
+  vst1q_lane_u32(to + stride*0, from, 0);
+  vst1q_lane_u32(to + stride*1, from, 1);
+  vst1q_lane_u32(to + stride*2, from, 2);
+  vst1q_lane_u32(to + stride*3, from, 3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
+{
+  vst1q_lane_s64(to + stride*0, from, 0);
+  vst1q_lane_s64(to + stride*1, from, 1);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
+{
+  vst1q_lane_u64(to + stride*0, from, 0);
+  vst1q_lane_u64(to + stride*1, from, 1);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
+template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
+template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
+template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
+template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
+template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
+template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
+template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
+template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
+template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
+template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
+
+template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  const float32x4_t a_r64 = vrev64q_f32(a);
+  return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
+{
+  const int8x16_t a_r64 = vrev64q_s8(a);
+  return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
+{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
+template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
+{
+  const uint8x16_t a_r64 = vrev64q_u8(a);
+  return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
+{
+  const int16x8_t a_r64 = vrev64q_s16(a);
+  return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
+template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
+{
+  const uint16x8_t a_r64 = vrev64q_u16(a);
+  return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  const int32x4_t a_r64 = vrev64q_s32(a);
+  return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
+template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
+{
+  const uint32x4_t a_r64 = vrev64q_u32(a);
+  return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
+{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
+template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
+{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
+{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
+template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
+template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
+template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vabsq_s64(a);
+#else
+  return vcombine_s64(
+      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
+      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
+{ return pfrexp_generic(a,exponent); }
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
+{ return pfrexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
+{ return pldexp_generic(a,exponent); }
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
+{ return pldexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpadd_f32(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
+{
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t sum = vpadd_s8(a_dup, a_dup);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
+{
+  int8x8_t sum = vpadd_s8(a,a);
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
+{
+  int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t sum = vpadd_u8(a_dup, a_dup);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t sum = vpadd_u8(a,a);
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t sum = vpadd_s16(a,a);
+  return vget_lane_s16(vpadd_s16(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
+{
+  int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
+  sum = vpadd_s16(sum, sum);
+  sum = vpadd_s16(sum, sum);
+  return vget_lane_s16(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t sum = vpadd_u16(a,a);
+  return vget_lane_u16(vpadd_u16(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
+  sum = vpadd_u16(sum, sum);
+  sum = vpadd_u16(sum, sum);
+  return vget_lane_u16(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpadd_s32(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpadd_u32(sum, sum), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
+{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
+template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
+{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
+      vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
+{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
+      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
+{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
+{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
+{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
+{
+  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
+  prod = vmul_s8(prod, vrev16_s8(prod));
+  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
+{
+  int8x8_t prod = vmul_s8(a, vrev16_s8(a));
+  prod = vmul_s8(prod, vrev32_s8(prod));
+  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
+{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
+{
+  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
+  prod = vmul_u8(prod, vrev16_u8(prod));
+  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
+  prod = vmul_u8(prod, vrev32_u8(prod));
+  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
+{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
+template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
+  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
+{
+  int16x4_t prod;
+
+  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
+  prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
+  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
+  prod = vmul_s16(prod, vrev32_s16(prod));
+  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
+  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
+  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t prod;
+
+  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
+  prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
+  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
+  prod = vmul_u16(prod, vrev32_u16(prod));
+  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
+  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
+{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
+{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
+template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
+{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
+{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(vpmin_f32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpmin_f32(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
+{
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t min = vpmin_s8(a_dup, a_dup);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
+{
+  int8x8_t min = vpmin_s8(a,a);
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
+{
+  int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t min = vpmin_u8(a_dup, a_dup);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t min = vpmin_u8(a,a);
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t min = vpmin_s16(a,a);
+  return vget_lane_s16(vpmin_s16(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
+{
+  int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
+  min = vpmin_s16(min, min);
+  min = vpmin_s16(min, min);
+  return vget_lane_s16(min, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t min = vpmin_u16(a,a);
+  return vget_lane_u16(vpmin_u16(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
+  min = vpmin_u16(min, min);
+  min = vpmin_u16(min, min);
+  return vget_lane_u16(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(vpmin_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpmin_s32(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(vpmin_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpmin_u32(min, min), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
+{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
+{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
+{ return vget_lane_f32(vpmax_f32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpmax_f32(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
+{
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t max = vpmax_s8(a_dup, a_dup);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
+{
+  int8x8_t max = vpmax_s8(a,a);
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
+{
+  int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
+{
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t max = vpmax_u8(a_dup, a_dup);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
+{
+  uint8x8_t max = vpmax_u8(a,a);
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
+{
+  uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
+{
+  const int16x4_t max = vpmax_s16(a,a);
+  return vget_lane_s16(vpmax_s16(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
+{
+  int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
+  max = vpmax_s16(max, max);
+  max = vpmax_s16(max, max);
+  return vget_lane_s16(max, 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
+{
+  const uint16x4_t max = vpmax_u16(a,a);
+  return vget_lane_u16(vpmax_u16(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
+{
+  uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
+  max = vpmax_u16(max, max);
+  max = vpmax_u16(max, max);
+  return vget_lane_u16(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
+{ return vget_lane_s32(vpmax_s32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
+{
+  const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpmax_s32(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
+{ return vget_lane_u32(vpmax_u32(a,a), 0); }
+template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
+{
+  const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpmax_u32(max, max), 0);
+}
+template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
+{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
+template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
+{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+                            vget_high_u32(vreinterpretq_u32_f32(x)));
+  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
+}
+
+// Helpers for ptranspose.
+namespace detail {
+  
+template<typename Packet>
+void zip_in_place(Packet& p1, Packet& p2);
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
+  const float32x2x2_t tmp = vzip_f32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
+  const float32x4x2_t tmp = vzipq_f32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
+  const int8x8x2_t tmp = vzip_s8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
+  const int8x16x2_t tmp = vzipq_s8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
+  const uint8x8x2_t tmp = vzip_u8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
+  const uint8x16x2_t tmp = vzipq_u8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
+  const int32x2x2_t tmp = vzip_s32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
+  const int32x4x2_t tmp = vzipq_s32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
+  const uint32x2x2_t tmp = vzip_u32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
+  const uint32x4x2_t tmp = vzipq_u32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
+  const int16x4x2_t tmp = vzip_s16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
+  const int16x8x2_t tmp = vzipq_s16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
+  const uint16x8x2_t tmp = vzipq_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[2]);
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[4]);
+  zip_in_place(kernel.packet[1], kernel.packet[5]);
+  zip_in_place(kernel.packet[2], kernel.packet[6]);
+  zip_in_place(kernel.packet[3], kernel.packet[7]);
+
+  zip_in_place(kernel.packet[0], kernel.packet[2]);
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[6]);
+  zip_in_place(kernel.packet[5], kernel.packet[7]);
+  
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[5]);
+  zip_in_place(kernel.packet[6], kernel.packet[7]);
+}
+
+template<typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
+  EIGEN_UNROLL_LOOP
+  for (int i=0; i<4; ++i) {
+    const int m = (1 << i);
+    EIGEN_UNROLL_LOOP
+    for (int j=0; j<m; ++j) {
+      const int n = (1 << (3-i));
+      EIGEN_UNROLL_LOOP
+      for (int k=0; k<n; ++k) {
+        const int idx = 2*j*n+k;
+        zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
+      }
+    }
+  }
+}
+
+} // namespace detail
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
+{
+  const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
+  const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
+
+  const int8x8x2_t zip8 = vzip_s8(a,b);
+  const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
+
+  kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
+  kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
+  kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
+  kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
+{
+  const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
+  const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
+
+  const uint8x8x2_t zip8 = vzip_u8(a,b);
+  const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
+
+  kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
+  kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
+  kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
+  kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+    detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
+  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2l, 2>& kernel)
+{
+#if EIGEN_ARCH_ARM64
+  const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = tmp1;
+#else
+  const int64x1_t tmp[2][2] = {
+    { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
+    { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
+  };
+
+  kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
+  kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2ul, 2>& kernel)
+{
+#if EIGEN_ARCH_ARM64
+  const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = tmp1;
+#else
+  const uint64x1_t tmp[2][2] = {
+    { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
+    { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
+  };
+
+  kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
+  kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
+#endif
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
+{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
+{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
+{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
+{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
+{ return vbsl_u8(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
+{ return vbslq_u8(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
+{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
+{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
+{ return vbsl_u16(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
+{ return vbslq_u16(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
+{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
+{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
+{ return vbsl_u32(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
+{ return vbslq_u32(mask, a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
+{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
+{ return vbslq_u64(mask, a, b); }
+
+// Use armv8 rounding intinsics if available.
+#if EIGEN_ARCH_ARMV8
+template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
+{ return vrndn_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
+{ return vrndnq_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
+{ return vrndm_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{ return vrndmq_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
+{ return vrndp_f32(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{ return vrndpq_f32(a); }
+
+#else
+
+template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
+  const Packet2f abs_a = pabs(a);
+  Packet2f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If greater, subtract one.
+  Packet4f mask = pcmp_lt(a, tmp);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
+{
+  const Packet2f cst_1 = pset1<Packet2f>(1.0f);
+  Packet2f tmp  = print<Packet2f>(a);
+  // If greater, subtract one.
+  Packet2f mask = pcmp_lt(a, tmp);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If smaller, add one.
+  Packet4f mask = pcmp_lt(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
+{
+  const Packet2f cst_1 = pset1<Packet2f>(1.0);
+  Packet2f tmp  = print<Packet2f>(a);
+  // If smaller, add one.
+  Packet2f mask = pcmp_lt(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+#endif
+
+/**
+ * Computes the integer square root
+ * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
+ *   and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
+ *   value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
+ */
+template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
+  uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t res = vdup_n_u8(0);
+  uint8x8_t add = vdup_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x8_t temp = vorr_u8(res, add);
+    res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
+    add = vshr_n_u8(add, 1);
+  }
+  return vget_lane_u32(vreinterpret_u32_u8(res), 0);
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
+  uint8x8_t res = vdup_n_u8(0);
+  uint8x8_t add = vdup_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x8_t temp = vorr_u8(res, add);
+    res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
+    add = vshr_n_u8(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
+  uint8x16_t res = vdupq_n_u8(0);
+  uint8x16_t add = vdupq_n_u8(0x8);
+  for (int i = 0; i < 4; i++)
+  {
+    const uint8x16_t temp = vorrq_u8(res, add);
+    res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
+    add = vshrq_n_u8(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
+  uint16x4_t res = vdup_n_u16(0);
+  uint16x4_t add = vdup_n_u16(0x80);
+  for (int i = 0; i < 8; i++)
+  {
+    const uint16x4_t temp = vorr_u16(res, add);
+    res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
+    add = vshr_n_u16(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
+  uint16x8_t res = vdupq_n_u16(0);
+  uint16x8_t add = vdupq_n_u16(0x80);
+  for (int i = 0; i < 8; i++)
+  {
+    const uint16x8_t temp = vorrq_u16(res, add);
+    res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
+    add = vshrq_n_u16(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
+  uint32x2_t res = vdup_n_u32(0);
+  uint32x2_t add = vdup_n_u32(0x8000);
+  for (int i = 0; i < 16; i++)
+  {
+    const uint32x2_t temp = vorr_u32(res, add);
+    res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
+    add = vshr_n_u32(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
+  uint32x4_t res = vdupq_n_u32(0);
+  uint32x4_t add = vdupq_n_u32(0x8000);
+  for (int i = 0; i < 16; i++)
+  {
+    const uint32x4_t temp = vorrq_u32(res, add);
+    res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
+    add = vshrq_n_u32(add, 1);
+  }
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet4f x = vrsqrteq_f32(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
+  x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
+  const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet2f x = vrsqrte_f32(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
+  x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
+  const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+// Unfortunately vsqrt_f32 is only available for A64.
+#if EIGEN_ARCH_ARM64
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
+  const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
+  return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
+}
+template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+  const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
+  const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
+  return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
+}
+#endif
+
+//---------- bfloat16 ----------
+// TODO: Add support for native armv8.6-a bfloat16_t
+
+// TODO: Guard if we have native bfloat16 support
+typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
+
+template<> struct is_arithmetic<Packet4bf> { enum { value = true }; };
+
+template<> struct packet_traits<bfloat16> : default_packet_traits
+{
+  typedef Packet4bf type;
+  typedef Packet4bf half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+    HasDiv       = 1,
+    HasFloor     = 1,
+    HasCeil      = 1,
+    HasRint      = 1,
+
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = EIGEN_FAST_MATH,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf  = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4bf>
+{
+  typedef bfloat16 type;
+  typedef Packet4bf half;
+  enum
+  {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+namespace detail {  
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+} // namespace detail
+
+EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
+{
+  // See the scalar implemention in BFloat16.h for a comprehensible explanation
+  // of this fast rounding algorithm
+  Packet4ui input = reinterpret_cast<Packet4ui>(p);
+
+  // lsb = (input >> 16) & 1
+  Packet4ui lsb =  vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
+
+  // rounding_bias = 0x7fff + lsb
+  Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
+
+  // input += rounding_bias
+  input = vaddq_u32(input, rounding_bias);
+
+  // input = input >> 16
+  input = vshrq_n_u32(input, 16);
+
+  // Replace float-nans by bfloat16-nans, that is 0x7fc0
+  const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
+  const Packet4ui mask = vceqq_f32(p, p);
+  input = vbslq_u32(mask, input, bf16_nan);
+
+  // output = static_cast<uint16_t>(input)
+  return vmovn_u32(input);
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
+{
+  return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
+}
+
+EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
+  return vmovn_u32(vreinterpretq_u32_f32(p));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
+  return pset1<Packet4us>(from.value);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
+  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
+{
+  return pload<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
+{
+  return ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
+{
+  return ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
+  return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
+                                                                            const Packet4bf &b)
+{
+  return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
+                                                                        const Packet4bf &b)
+{
+  return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
+                                                          const Packet4bf &b)
+{
+  return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
+                                                                            const Packet4bf &b)
+{
+  return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
+                                                                        const Packet4bf &b)
+{
+  return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
+                                                          const Packet4bf &b)
+{
+  return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
+{
+  return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
+  return por<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
+  return pxor<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
+  return pand<Packet4us>(a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
+  return pandnot<Packet4us>(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
+                                                      const Packet4bf& b)
+{
+  return pselect<Packet4us>(mask, a, b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
+{
+  return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
+{
+  return pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride);
+}
+
+template<>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
+{
+  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), from, stride);
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a)
+{
+  return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
+{
+  return preverse<Packet4us>(a);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
+{
+  detail::ptranspose_impl(kernel);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
+{
+  return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
+{
+  return pxor<Packet4us>(a, pset1<Packet4us>(static_cast<uint16_t>(0x8000)));
+}
+
+//---------- double ----------
+
+// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
+// Confirmed at least with __apple_build_version__ = 6000054.
+#ifdef __apple_build_version__
+// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
+// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
+// major toolchain updates.
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#else
+#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
+#endif
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+// Bug 907: workaround missing declarations of the following two functions in the ADK
+// Defining these functions as templates ensures that if these intrinsics are
+// already defined in arm_neon.h, then our workaround doesn't cause a conflict
+// and has lower priority in overload resolution.
+template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
+
+template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+
+typedef float64x2_t Packet2d;
+typedef float64x1_t Packet1d;
+
+// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
+// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
+// for fast inversion of matrices of size 4.
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
+{
+  const double* a = reinterpret_cast<const double*>(&m);
+  const double* b = reinterpret_cast<const double*>(&n);
+  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
+{
+  return shuffle(a, b, mask);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 0);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 3);
+}
+#define vec2d_duplane(a, p) \
+  vdupq_laneq_f64(a, p)
+
+template<> struct packet_traits<double>  : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum
+  {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+
+    HasCmp       = 1,
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 1,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 1,
+    HasArg       = 0,
+    HasAbs2      = 1,
+    HasAbsDiff   = 1,
+    HasMin       = 1,
+    HasMax       = 1,
+    HasConj      = 1,
+    HasSetLinear = 0,
+    HasBlend     = 0,
+
+    HasDiv   = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = 0,
+    HasErf  = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2d>
+{
+  typedef double type;
+  typedef Packet2d half;
+  typedef Packet2l integer_packet;
+  enum
+  {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
+{
+  const double c[] = {0.0,1.0};
+  return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
+  const Packet2d mask = {numext::bit_cast<double>(0x8000000000000000ull),0.0};
+  return padd(a, pxor(mask, b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+
+#ifdef __ARM_FEATURE_FMA
+// See bug 936. See above comment about FMA for float.
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
+{ return vfmaq_f64(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
+{ return vmlaq_f64(c,a,b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
+
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
+{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
+{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
+{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vld1q_lane_f64(from + 0*stride, res, 0);
+  res = vld1q_lane_f64(from + 1*stride, res, 1);
+  return res;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  vst1q_lane_f64(to + stride*0, from, 0);
+  vst1q_lane_f64(to + stride*1, from, 1);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
+
+// FIXME only store the 2 first elements ?
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+// workaround ICE, see bug 907
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{ return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
+#endif
+
+// Other reduction functions:
+// mul
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+#endif
+
+// min
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{ return vgetq_lane_f64(vpminq_f64(a,a), 0); }
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
+
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet2d, 2>& kernel)
+{
+  const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
+
+  kernel.packet[0] = tmp1;
+  kernel.packet[1] = tmp2;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
+{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
+{ return vrndnq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{ return vrndmq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{ return vrndpq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
+{ return pldexp_generic(a, exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
+{ return pfrexp_generic(a,exponent); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
+{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  // Compute approximate reciprocal sqrt.
+  Packet2d x = vrsqrteq_f64(a);
+  // Do Newton iterations for 1/sqrt(x).
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
+  const Packet2d infinity = pset1<Packet2d>(NumTraits<double>::infinity());
+  return pselect(pcmp_eq(a, pzero(a)), infinity, x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
+
+#endif // EIGEN_ARCH_ARM64
+
+// Do we have an fp16 types and supporting Neon intrinsics?
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+typedef float16x4_t Packet4hf;
+typedef float16x8_t Packet8hf;
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8hf type;
+  typedef Packet4hf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 4,
-    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
-   
-    HasDiv  = 1,
-    // FIXME check the Has*
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 1,
-    HasSqrt = 0
+    size = 8,
+    HasHalfPacket = 1,
+
+    HasCmp = 1,
+    HasCast = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasInsert = 1,
+    HasReduxp = 1,
+    HasDiv = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasErf = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
   };
 };
-template<> struct packet_traits<int32_t>    : default_packet_traits
-{
-  typedef Packet4i type;
-  typedef Packet4i half; // Packet2i intrinsics not implemented yet
+
+template <>
+struct unpacket_traits<Packet4hf> {
+  typedef Eigen::half type;
+  typedef Packet4hf half;
   enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size=4,
-    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
-    // FIXME check the Has*
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
   };
 };
 
-#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
-// workaround gcc 4.2, 4.3 and 4.4 compilatin issue
-EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
-EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
-#endif
-
-template<> struct unpacket_traits<Packet4f> { typedef float   type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
-
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t&    from)   { return vdupq_n_s32(from); }
+template <>
+struct unpacket_traits<Packet8hf> {
+  typedef Eigen::half type;
+  typedef Packet4hf half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
-{
-  const float f[] = {0, 1, 2, 3};
-  Packet4f countdown = vld1q_f32(f);
-  return vaddq_f32(pset1<Packet4f>(a), countdown);
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
+  return vadd_f16(vget_low_f16(a), vget_high_f16(a));
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
-{
-  const int32_t i[] = {0, 1, 2, 3};
-  Packet4i countdown = vld1q_s32(i);
-  return vaddq_s32(pset1<Packet4i>(a), countdown);
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
+  return vdupq_n_f16(from.x);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
+  return vdup_n_f16(from.x);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
+  const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  Packet8hf countdown = vld1q_f16(f);
+  return vaddq_f16(pset1<Packet8hf>(a), countdown);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
+  const float16_t f[] = {0, 1, 2, 3};
+  Packet4hf countdown = vld1_f16(f);
+  return vadd_f16(pset1<Packet4hf>(a), countdown);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vaddq_f16(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vadd_f16(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-#if EIGEN_ARCH_ARM64
-  return vdivq_f32(a,b);
-#else
-  Packet4f inv, restep, div;
+template <>
+EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vsubq_f16(a, b);
+}
 
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpeq_f32(b);
+template <>
+EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vsub_f16(a, b);
+}
 
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecpsq_f32(b, inv);
-  inv = vmulq_f32(restep, inv);
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
+  return vnegq_f16(a);
+}
 
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmulq_f32(a, inv);
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
+  return vneg_f16(a);
+}
 
-  return div;
-#endif
+template <>
+EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
+  return a;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet4i>(0);
+template <>
+EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
+  return a;
 }
 
-// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
-// then implements a slow software scalar fallback calling fmaf()!
-// Filed LLVM bug:
-//     https://llvm.org/bugs/show_bug.cgi?id=27216
-#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
-// See bug 936.
-// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
-// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
-// MLA is not fused i.e. does 2 roundings.
-// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
-// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
-#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
-  // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
-  // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
-  // -march=armv7-a, that is a very common case.
-  // See e.g. this thread:
-  //     http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
-  // Filed LLVM bug:
-  //     https://llvm.org/bugs/show_bug.cgi?id=27219
-  Packet4f r = c;
-  asm volatile(
-    "vmla.f32 %q[r], %q[a], %q[b]"
-    : [r] "+w" (r)
-    : [a] "w" (a),
-      [b] "w" (b)
-    : );
-  return r;
-#else
-  return vmlaq_f32(c,a,b);
-#endif
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmulq_f16(a, b);
 }
-#endif
 
-// No FMA instruction for int, so use MLA unconditionally.
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmul_f16(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vdivq_f16(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vdiv_f16(a, b);
+}
 
-// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return vfmaq_f16(c, a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return vfma_f16(c, a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vminq_f16(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmin_f16(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*    from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t*  from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
+#endif
 
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
-  float32x2_t lo, hi;
-  lo = vld1_dup_f32(from);
-  hi = vld1_dup_f32(from+1);
-  return vcombine_f32(lo, hi);
+template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmaxq_f16(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
-{
-  int32x2_t lo, hi;
-  lo = vld1_dup_s32(from);
-  hi = vld1_dup_s32(from+1);
-  return vcombine_s32(lo, hi);
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmax_f16(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>  (float*    to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t*  to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
+template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
+#endif
 
-template<> EIGEN_STRONG_INLINE void pstoreu<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
-  Packet4f res = pset1<Packet4f>(0.f);
-  res = vsetq_lane_f32(from[0*stride], res, 0);
-  res = vsetq_lane_f32(from[1*stride], res, 1);
-  res = vsetq_lane_f32(from[2*stride], res, 2);
-  res = vsetq_lane_f32(from[3*stride], res, 3);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
-{
-  Packet4i res = pset1<Packet4i>(0);
-  res = vsetq_lane_s32(from[0*stride], res, 0);
-  res = vsetq_lane_s32(from[1*stride], res, 1);
-  res = vsetq_lane_s32(from[2*stride], res, 2);
-  res = vsetq_lane_s32(from[3*stride], res, 3);
-  return res;
-}
+template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
-  to[stride*0] = vgetq_lane_f32(from, 0);
-  to[stride*1] = vgetq_lane_f32(from, 1);
-  to[stride*2] = vgetq_lane_f32(from, 2);
-  to[stride*3] = vgetq_lane_f32(from, 3);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
-{
-  to[stride*0] = vgetq_lane_s32(from, 0);
-  to[stride*1] = vgetq_lane_s32(from, 1);
-  to[stride*2] = vgetq_lane_s32(from, 2);
-  to[stride*3] = vgetq_lane_s32(from, 3);
-}
+#define EIGEN_MAKE_ARM_FP16_CMP_8(name)                                               \
+  template <>                                                                         \
+  EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
+    return vreinterpretq_f16_u16(vc##name##q_f16(a, b));                              \
+  }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>  (const float*    addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t*  addr) { EIGEN_ARM_PREFETCH(addr); }
+#define EIGEN_MAKE_ARM_FP16_CMP_4(name)                                               \
+  template <>                                                                         \
+  EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
+    return vreinterpret_f16_u16(vc##name##_f16(a, b));                                \
+  }
 
-// FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float   pfirst<Packet4f>(const Packet4f& a) { float   EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+EIGEN_MAKE_ARM_FP16_CMP_8(eq)
+EIGEN_MAKE_ARM_FP16_CMP_8(lt)
+EIGEN_MAKE_ARM_FP16_CMP_8(le)
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
-  float32x2_t a_lo, a_hi;
-  Packet4f a_r64;
+EIGEN_MAKE_ARM_FP16_CMP_4(eq)
+EIGEN_MAKE_ARM_FP16_CMP_4(lt)
+EIGEN_MAKE_ARM_FP16_CMP_4(le)
 
-  a_r64 = vrev64q_f32(a);
-  a_lo = vget_low_f32(a_r64);
-  a_hi = vget_high_f32(a_r64);
-  return vcombine_f32(a_hi, a_lo);
+#undef EIGEN_MAKE_ARM_FP16_CMP_8
+#undef EIGEN_MAKE_ARM_FP16_CMP_4
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
 }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
-  int32x2_t a_lo, a_hi;
-  Packet4i a_r64;
 
-  a_r64 = vrev64q_s32(a);
-  a_lo = vget_low_s32(a_r64);
-  a_hi = vget_high_s32(a_r64);
-  return vcombine_s32(a_hi, a_lo);
+template <>
+EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
+{ return vrndnq_f16(a); }
 
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  float32x2_t a_lo, a_hi, sum;
+template <>
+EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
+{ return vrndn_f16(a); }
 
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  sum = vpadd_f32(a_lo, a_hi);
-  sum = vpadd_f32(sum, sum);
-  return vget_lane_f32(sum, 0);
-}
+template <>
+EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
+{ return vrndmq_f16(a); }
 
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  float32x4x2_t vtrn1, vtrn2, res1, res2;
-  Packet4f sum1, sum2, sum;
+template <>
+EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
+{ return vrndm_f16(a); }
 
-  // NEON zip performs interleaving of the supplied vectors.
-  // We perform two interleaves in a row to acquire the transposed vector
-  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
-  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
-  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
-  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
+template <>
+EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
+{ return vrndpq_f16(a); }
 
-  // Do the addition of the resulting vectors
-  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
-  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
-  sum = vaddq_f32(sum1, sum2);
+template <>
+EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
+{ return vrndp_f16(a); }
 
-  return sum;
+template <>
+EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
+  return vsqrtq_f16(a);
 }
 
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
-{
-  int32x2_t a_lo, a_hi, sum;
+template <>
+EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
+  return vsqrt_f16(a);
+}
 
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  sum = vpadd_s32(a_lo, a_hi);
-  sum = vpadd_s32(sum, sum);
-  return vget_lane_s32(sum, 0);
+template <>
+EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  int32x4x2_t vtrn1, vtrn2, res1, res2;
-  Packet4i sum1, sum2, sum;
+template <>
+EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
 
-  // NEON zip performs interleaving of the supplied vectors.
-  // We perform two interleaves in a row to acquire the transposed vector
-  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
-  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
-  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
-  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
+template <>
+EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
 
-  // Do the addition of the resulting vectors
-  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
-  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
-  sum = vaddq_s32(sum1, sum2);
+template <>
+EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
 
-  return sum;
+template <>
+EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
 }
 
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
-  float32x2_t a_lo, a_hi, prod;
+template <>
+EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
 
-  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
-  prod = vmul_f32(a_lo, a_hi);
-  // Multiply prod with its swapped value |a2*a4|a1*a3|
-  prod = vmul_f32(prod, vrev64_f32(prod));
+template <>
+EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
 
-  return vget_lane_f32(prod, 0);
+template <>
+EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
 }
-template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
-{
-  int32x2_t a_lo, a_hi, prod;
 
-  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
-  prod = vmul_s32(a_lo, a_hi);
-  // Multiply prod with its swapped value |a2*a4|a1*a3|
-  prod = vmul_s32(prod, vrev64_s32(prod));
+template <>
+EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
+}
 
-  return vget_lane_s32(prod, 0);
+template <>
+EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
 }
 
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
-  float32x2_t a_lo, a_hi, min;
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
+}
 
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  min = vpmin_f32(a_lo, a_hi);
-  min = vpmin_f32(min, min);
+template <>
+EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
+}
 
-  return vget_lane_f32(min, 0);
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
+  Packet8hf packet;
+  packet[0] = from[0].x;
+  packet[1] = from[0].x;
+  packet[2] = from[1].x;
+  packet[3] = from[1].x;
+  packet[4] = from[2].x;
+  packet[5] = from[2].x;
+  packet[6] = from[3].x;
+  packet[7] = from[3].x;
+  return packet;
 }
 
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
-{
-  int32x2_t a_lo, a_hi, min;
+template <>
+EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
+  float16x4_t packet;
+  float16_t* tmp;
+  tmp = (float16_t*)&packet;
+  tmp[0] = from[0].x;
+  tmp[1] = from[0].x;
+  tmp[2] = from[1].x;
+  tmp[3] = from[1].x;
+  return packet;
+}
 
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  min = vpmin_s32(a_lo, a_hi);
-  min = vpmin_s32(min, min);
-  
-  return vget_lane_s32(min, 0);
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
+  Packet4hf lo, hi;
+  lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
+  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from+1));
+  return vcombine_f16(lo, hi);
 }
 
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  float32x2_t a_lo, a_hi, max;
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
 
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  max = vpmax_f32(a_lo, a_hi);
-  max = vpmax_f32(max, max);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
 
-  return vget_lane_f32(max, 0);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
+  return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
 }
 
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
-{
-  int32x2_t a_lo, a_hi, max;
-
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  max = vpmax_s32(a_lo, a_hi);
-  max = vpmax_s32(max, max);
-
-  return vget_lane_s32(max, 0);
-}
-
-// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
-// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
-#define PALIGN_NEON(Offset,Type,Command) \
-template<>\
-struct palign_impl<Offset,Type>\
-{\
-    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
-    {\
-        if (Offset!=0)\
-            first = Command(first, second, Offset);\
-    }\
-};\
-
-PALIGN_NEON(0,Packet4f,vextq_f32)
-PALIGN_NEON(1,Packet4f,vextq_f32)
-PALIGN_NEON(2,Packet4f,vextq_f32)
-PALIGN_NEON(3,Packet4f,vextq_f32)
-PALIGN_NEON(0,Packet4i,vextq_s32)
-PALIGN_NEON(1,Packet4i,vextq_s32)
-PALIGN_NEON(2,Packet4i,vextq_s32)
-PALIGN_NEON(3,Packet4i,vextq_s32)
-
-#undef PALIGN_NEON
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
-  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
-
-  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
-  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
-  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
-  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
-  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
-  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
-  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
+  return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
 }
 
-//---------- double ----------
-
-// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
-// Confirmed at least with __apple_build_version__ = 6000054.
-#ifdef __apple_build_version__
-// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
-// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
-// major toolchain updates.
-#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
-#else
-#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
-#endif
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
 
-#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
 
-// Bug 907: workaround missing declarations of the following two functions in the ADK
-// Defining these functions as templates ensures that if these intrinsics are
-// already defined in arm_neon.h, then our workaround doesn't cause a conflict
-// and has lower priority in overload resolution.
-template <typename T>
-uint64x2_t vreinterpretq_u64_f64(T a)
-{
-  return (uint64x2_t) a;
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
 }
 
-template <typename T>
-float64x2_t vreinterpretq_f64_u64(T a)
-{
-  return (float64x2_t) a;
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
 }
 
-typedef float64x2_t Packet2d;
-typedef float64x1_t Packet1d;
-
-template<> struct packet_traits<double>  : default_packet_traits
-{
-  typedef Packet2d type;
-  typedef Packet2d half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
-    HasHalfPacket=0,
-   
-    HasDiv  = 1,
-    // FIXME check the Has*
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 0,
-    HasSqrt = 0
-  };
-};
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
+}
 
-template<> struct unpacket_traits<Packet2d> { typedef double  type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
+  Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
+  res = vsetq_lane_f16(from[0 * stride].x, res, 0);
+  res = vsetq_lane_f16(from[1 * stride].x, res, 1);
+  res = vsetq_lane_f16(from[2 * stride].x, res, 2);
+  res = vsetq_lane_f16(from[3 * stride].x, res, 3);
+  res = vsetq_lane_f16(from[4 * stride].x, res, 4);
+  res = vsetq_lane_f16(from[5 * stride].x, res, 5);
+  res = vsetq_lane_f16(from[6 * stride].x, res, 6);
+  res = vsetq_lane_f16(from[7 * stride].x, res, 7);
+  return res;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
-{
-  const double countdown_raw[] = {0.0,1.0};
-  const Packet2d countdown = vld1q_f64(countdown_raw);
-  return vaddq_f64(pset1<Packet2d>(a), countdown);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
+  Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
+  res = vset_lane_f16(from[0 * stride].x, res, 0);
+  res = vset_lane_f16(from[1 * stride].x, res, 1);
+  res = vset_lane_f16(from[2 * stride].x, res, 2);
+  res = vset_lane_f16(from[3 * stride].x, res, 3);
+  return res;
 }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
+  to[stride * 0].x = vgetq_lane_f16(from, 0);
+  to[stride * 1].x = vgetq_lane_f16(from, 1);
+  to[stride * 2].x = vgetq_lane_f16(from, 2);
+  to[stride * 3].x = vgetq_lane_f16(from, 3);
+  to[stride * 4].x = vgetq_lane_f16(from, 4);
+  to[stride * 5].x = vgetq_lane_f16(from, 5);
+  to[stride * 6].x = vgetq_lane_f16(from, 6);
+  to[stride * 7].x = vgetq_lane_f16(from, 7);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
+  to[stride * 0].x = vget_lane_f16(from, 0);
+  to[stride * 1].x = vget_lane_f16(from, 1);
+  to[stride * 2].x = vget_lane_f16(from, 2);
+  to[stride * 3].x = vget_lane_f16(from, 3);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
+  float16_t x[8];
+  vst1q_f16(x, a);
+  Eigen::half h;
+  h.x = x[0];
+  return h;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
+  float16_t x[4];
+  vst1_f16(x, a);
+  Eigen::half h;
+  h.x = x[0];
+  return h;
+}
 
-#ifdef __ARM_FEATURE_FMA
-// See bug 936. See above comment about FMA for float.
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
-#else
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
-#endif
+template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi;
+  Packet8hf a_r64;
 
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+  a_r64 = vrev64q_f16(a);
+  a_lo = vget_low_f16(a_r64);
+  a_hi = vget_high_f16(a_r64);
+  return vcombine_f16(a_hi, a_lo);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
+  return vrev64_f16(a);
+}
 
-// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+template <>
+EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
+  return vabsq_f16(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+template <>
+EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
+  return vabs_f16(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, sum;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  sum = vpadd_f16(a_lo, a_hi);
+  sum = vpadd_f16(sum, sum);
+  sum = vpadd_f16(sum, sum);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(sum, 0);
+  return h;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
-  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
+  float16x4_t sum;
+
+  sum = vpadd_f16(a, a);
+  sum = vpadd_f16(sum, sum);
+  Eigen::half h;
+  h.x = vget_lane_f16(sum, 0);
+  return h;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, prod;
 
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  prod = vmul_f16(a_lo, a_hi);
+  prod = vmul_f16(prod, vrev64_f16(prod));
 
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
-{
-  return vld1q_dup_f64(from);
+  Eigen::half h;
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
+  return h;
 }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
+  float16x4_t prod;
+  prod = vmul_f16(a, vrev64_f16(a));
+  Eigen::half h;
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
+  return h;
+}
 
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
-  Packet2d res = pset1<Packet2d>(0.0);
-  res = vsetq_lane_f64(from[0*stride], res, 0);
-  res = vsetq_lane_f64(from[1*stride], res, 1);
-  return res;
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, min;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  min = vpmin_f16(a_lo, a_hi);
+  min = vpmin_f16(min, min);
+  min = vpmin_f16(min, min);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(min, 0);
+  return h;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
-  to[stride*0] = vgetq_lane_f64(from, 0);
-  to[stride*1] = vgetq_lane_f64(from, 1);
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
+  Packet4hf tmp;
+  tmp = vpmin_f16(a, a);
+  tmp = vpmin_f16(tmp, tmp);
+  Eigen::half h;
+  h.x = vget_lane_f16(tmp, 0);
+  return h;
 }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
 
-// FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, max;
 
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  max = vpmax_f16(a_lo, a_hi);
+  max = vpmax_f16(max, max);
+  max = vpmax_f16(max, max);
 
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+  Eigen::half h;
+  h.x = vget_lane_f16(max, 0);
+  return h;
+}
 
-#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
-// workaround ICE, see bug 907
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
-#else
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
-#endif
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
+  Packet4hf tmp;
+  tmp = vpmax_f16(a, a);
+  tmp = vpmax_f16(tmp, tmp);
+  Eigen::half h;
+  h.x = vget_lane_f16(tmp, 0);
+  return h;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
 {
-  float64x2_t trn1, trn2;
+  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
+  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
 
-  // NEON zip performs interleaving of the supplied vectors.
-  // We perform two interleaves in a row to acquire the transposed vector
-  trn1 = vzip1q_f64(vecs[0], vecs[1]);
-  trn2 = vzip2q_f64(vecs[0], vecs[1]);
+  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
+  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
 
-  // Do the addition of the resulting vectors
-  return vaddq_f64(trn1, trn2);
+  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
+  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
+  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
+  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
 }
-// Other reduction functions:
-// mul
-#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
-#else
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
-#endif
 
-// min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
+  EIGEN_ALIGN16 float16x4x4_t tmp_x4;
+  float16_t* tmp = (float16_t*)&kernel;
+  tmp_x4 = vld4_f16(tmp);
 
-// max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
-
-// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
-// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
-#define PALIGN_NEON(Offset,Type,Command) \
-template<>\
-struct palign_impl<Offset,Type>\
-{\
-    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
-    {\
-        if (Offset!=0)\
-            first = Command(first, second, Offset);\
-    }\
-};\
-
-PALIGN_NEON(0,Packet2d,vextq_f64)
-PALIGN_NEON(1,Packet2d,vextq_f64)
-#undef PALIGN_NEON
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
-  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
-  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
-
-  kernel.packet[0] = trn1;
-  kernel.packet[1] = trn2;
-}
-#endif // EIGEN_ARCH_ARM64 
+  kernel.packet[0] = tmp_x4.val[0];
+  kernel.packet[1] = tmp_x4.val[1];
+  kernel.packet[2] = tmp_x4.val[2];
+  kernel.packet[3] = tmp_x4.val[3];
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
+  float16x8x2_t T_1[4];
+
+  T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
+  T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
+  T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
+  T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
+
+  float16x8x2_t T_2[4];
+  T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
+  T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
+  T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
+  T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
+
+  float16x8x2_t T_3[4];
+  T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
+  T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
+  T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
+  T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
+
+  kernel.packet[0] = T_3[0].val[0];
+  kernel.packet[1] = T_3[2].val[0];
+  kernel.packet[2] = T_3[1].val[0];
+  kernel.packet[3] = T_3[3].val[0];
+  kernel.packet[4] = T_3[0].val[1];
+  kernel.packet[5] = T_3[2].val[1];
+  kernel.packet[6] = T_3[1].val[1];
+  kernel.packet[7] = T_3[3].val[1];
+}
+#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 
 } // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/SSE/Complex.h b/contrib/eigen/Eigen/src/Core/arch/SSE/Complex.h
index d075043ce1b4917a8afe55188607a77d0a287124..8fe22da46c78b423bb5daf76b64ae24069bd04e7 100644
--- a/contrib/eigen/Eigen/src/Core/arch/SSE/Complex.h
+++ b/contrib/eigen/Eigen/src/Core/arch/SSE/Complex.h
@@ -19,7 +19,7 @@ struct Packet2cf
 {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
-  __m128  v;
+  Packet4f v;
 };
 
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
@@ -40,20 +40,33 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
     HasSetLinear = 0,
-    HasBlend = 1
+    HasBlend  = 1
   };
 };
 #endif
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+  enum {
+    size=2,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
 {
   const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
@@ -82,10 +95,11 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf ptrue  <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
@@ -93,19 +107,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<fl
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
   Packet2cf res;
-#if EIGEN_GNUC_AT_MOST(4,2)
-  // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
-  res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
-#elif EIGEN_GNUC_AT_LEAST(4,6)
-  // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
-  #pragma GCC diagnostic push
-  #pragma GCC diagnostic ignored "-Wuninitialized"
-  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
-  #pragma GCC diagnostic pop
+#ifdef EIGEN_VECTORIZE_SSE3
+  res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast<double const*>(&from)));
 #else
-  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+  res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<double const*>(&from)));
+  res.v = _mm_movelh_ps(res.v, res.v);
 #endif
-  return Packet2cf(_mm_movelh_ps(res.v,res.v));
+  return res;
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
@@ -152,97 +160,26 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
   return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
-  return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-      first.v = _mm_movehl_ps(first.v, first.v);
-      first.v = _mm_movelh_ps(first.v, second.v);
-    }
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(a, pconj(b));
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
-                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(pconj(a), b);
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-                                _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                                      vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
 {
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return pconj(internal::pmul(a, b));
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
-                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-    #endif
-  }
-};
+  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
+}
 
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for SSE3 and 4
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet2cf res = pmul(a, pconj(b));
   __m128 s = _mm_mul_ps(b.v,b.v);
-  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
+  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2))));
 }
 
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
-{
-  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
-}
 
 
 //---------- double ----------
@@ -250,7 +187,7 @@ struct Packet1cd
 {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
-  __m128d  v;
+  Packet2d v;
 };
 
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
@@ -271,6 +208,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
     HasMul    = 1,
     HasDiv    = 1,
     HasNegate = 1,
+    HasSqrt   = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
     HasMin    = 0,
@@ -280,7 +218,18 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 };
 #endif
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum {
+    size=1,
+    alignment=Aligned16,
+    vectorizable=true,
+    masked_load_available=false,
+    masked_store_available=false
+  };
+};
 
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
@@ -305,10 +254,11 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cd ptrue  <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); }
 
 // FIXME force unaligned load, this is a temporary fix
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
@@ -340,86 +290,17 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Pack
   return pfirst(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
-  return vecs[0];
-}
-
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
 {
   return pfirst(a);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
-  {
-    // FIXME is it sure we never have to align a Packet1cd?
-    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(a, pconj(b));
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
-                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                           vec2d_swizzle1(b.v, 1, 0))));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(pconj(a), b);
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
-                                _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                                      vec2d_swizzle1(b.v, 1, 0)), mask)));
-    #endif
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return pconj(internal::pmul(a, b));
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
-                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                           vec2d_swizzle1(b.v, 1, 0))));
-    #endif
-  }
-};
-
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   // TODO optimize it for SSE3 and 4
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet1cd res = pmul(a,pconj(b));
   __m128d s = _mm_mul_pd(b.v,b.v);
   return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
 }
@@ -439,33 +320,32 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
   kernel.packet[1].v = tmp;
 }
 
-template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
-  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
-  return Packet2cf(_mm_castpd_ps(result));
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+  __m128 eq = _mm_cmpeq_ps(a.v, b.v);
+  return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b)
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
 {
-  return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b)));
+  __m128d eq = _mm_cmpeq_pd(a.v, b.v);
+  return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b)
-{
-  return pset1<Packet1cd>(b);
+template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
+  return Packet2cf(_mm_castpd_ps(result));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b)
-{
-  return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b)));
+template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b)
-{
-  return pset1<Packet1cd>(b);
+template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
 }
 
 } // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_COMPLEX_SSE_H
diff --git a/contrib/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h b/contrib/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h
index 7b5f948e1199586a544b41b4af12b186abd82e49..8736d0d6b50a1b7711ccbfb31e38987fd7bb312d 100644
--- a/contrib/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/contrib/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -8,7 +8,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* The sin, cos, exp, and log functions of this file come from
+/* The sin and cos and functions of this file come from
  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
  */
 
@@ -20,426 +20,57 @@ namespace Eigen {
 namespace internal {
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  /* the smallest non denormalized float number */
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000);//-1.f/0.f);
-
-  /* natural logarithm computed for 4 simultaneous float
-    return NaN for x <= 0
-  */
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-
-  Packet4i emm0;
-
-  Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
-  Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-
-  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
-  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
-
-  /* keep only the fractional part */
-  x = _mm_and_ps(x, p4f_inv_mant_mask);
-  x = _mm_or_ps(x, p4f_half);
-
-  emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
-
-  /* part2:
-     if( x < SQRTHF ) {
-       e -= 1;
-       x = x + x - 1.0;
-     } else { x = x - 1.0; }
-  */
-  Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = pand(x, mask);
-  x = psub(x, p4f_1);
-  e = psub(e, pand(p4f_1, mask));
-  x = padd(x, tmp);
-
-  Packet4f x2 = pmul(x,x);
-  Packet4f x3 = pmul(x2,x);
-
-  Packet4f y, y1, y2;
-  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
-  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
-  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
-  y  = pmadd(y , x, p4f_cephes_log_p2);
-  y1 = pmadd(y1, x, p4f_cephes_log_p5);
-  y2 = pmadd(y2, x, p4f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  y1 = pmul(e, p4f_cephes_log_q1);
-  tmp = pmul(x2, p4f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p4f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-  // negative arg will be NAN, 0 will be -INF
-  return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
-                   _mm_and_ps(iszero_mask, p4f_minus_inf));
+Packet4f plog<Packet4f>(const Packet4f& _x) {
+  return plog_float(_x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-
-  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-  Packet4f tmp, fx;
-  Packet4i emm0;
+Packet2d plog<Packet2d>(const Packet2d& _x) {
+  return plog_double(_x);
+}
 
-  // clamp x
-  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog2<Packet4f>(const Packet4f& _x) {
+  return plog2_float(_x);
+}
 
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d plog2<Packet2d>(const Packet2d& _x) {
+  return plog2_double(_x);
+}
 
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  fx = _mm_floor_ps(fx);
-#else
-  emm0 = _mm_cvttps_epi32(fx);
-  tmp  = _mm_cvtepi32_ps(emm0);
-  /* if greater, substract 1 */
-  Packet4f mask = _mm_cmpgt_ps(tmp, fx);
-  mask = _mm_and_ps(mask, p4f_1);
-  fx = psub(tmp, mask);
-#endif
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog1p<Packet4f>(const Packet4f& _x) {
+  return generic_plog1p(_x);
+}
 
-  tmp = pmul(fx, p4f_cephes_exp_C1);
-  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  z = pmul(x,x);
-
-  Packet4f y = p4f_cephes_exp_p0;
-  y = pmadd(y, x, p4f_cephes_exp_p1);
-  y = pmadd(y, x, p4f_cephes_exp_p2);
-  y = pmadd(y, x, p4f_cephes_exp_p3);
-  y = pmadd(y, x, p4f_cephes_exp_p4);
-  y = pmadd(y, x, p4f_cephes_exp_p5);
-  y = pmadd(y, z, x);
-  y = padd(y, p4f_1);
-
-  // build 2^n
-  emm0 = _mm_cvttps_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_0x7f);
-  emm0 = _mm_slli_epi32(emm0, 23);
-  return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexpm1<Packet4f>(const Packet4f& _x) {
+  return generic_expm1(_x);
 }
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
+Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
-  Packet2d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-  static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
-  Packet2d tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  fx = _mm_floor_pd(fx);
-#else
-  emm0 = _mm_cvttpd_epi32(fx);
-  tmp  = _mm_cvtepi32_pd(emm0);
-  /* if greater, substract 1 */
-  Packet2d mask = _mm_cmpgt_pd(tmp, fx);
-  mask = _mm_and_pd(mask, p2d_1);
-  fx = psub(tmp, mask);
-#endif
-
-  tmp = pmul(fx, p2d_cephes_exp_C1);
-  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet2d x2 = pmul(x,x);
-
-  Packet2d px = p2d_cephes_exp_p0;
-  px = pmadd(px, x2, p2d_cephes_exp_p1);
-  px = pmadd(px, x2, p2d_cephes_exp_p2);
-  px = pmul (px, x);
-
-  Packet2d qx = p2d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
-  x = pdiv(px,psub(qx,px));
-  x = pmadd(p2d_2,x,p2d_1);
-
-  // build 2^n
-  emm0 = _mm_cvttpd_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
-  emm0 = _mm_slli_epi32(emm0, 20);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
+  return pexp_float(_x);
 }
 
-/* evaluation of 4 sines at onces, using SSE2 intrinsics.
-
-   The code is the exact rewriting of the cephes sinf function.
-   Precision is excellent as long as x < 8192 (I did not bother to
-   take into account the special handling they have for greater values
-   -- it does not return garbage for arguments over 8192, though, but
-   the extra precision is missing).
-
-   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
-   surprising but correct result.
-*/
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& x)
+{
+  return pexp_double(x);
+}
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psin<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
-  Packet4i emm0, emm2;
-  sign_bit = x;
-  /* take the absolute value */
-  x = pabs(x);
-
-  /* take the modulo */
-
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* store the integer part of y in mm0 */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-  /* get the swap sign flag */
-  emm0 = _mm_and_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = _mm_mul_ps(x,x);
-
-  y = pmadd(y, z, p4f_coscof_p1);
-  y = pmadd(y, z, p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = pmul(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmul(y2, x);
-  y2 = padd(y2, x);
-
-  /* select the correct result from the two polynoms */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y = _mm_andnot_ps(poly_mask, y);
-  y = _mm_or_ps(y,y2);
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
+  return psin_float(_x);
 }
 
-/* almost the same as psin */
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pcos<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, y;
-  Packet4i emm0, emm2;
-
-  x = pabs(x);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* get the integer part of y */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-
-  emm2 = _mm_sub_epi32(emm2, p4i_2);
-
-  /* get the swap sign flag */
-  emm0 = _mm_andnot_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynom selection mask */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = pmul(x,x);
-
-  y = pmadd(y,z,p4f_coscof_p1);
-  y = pmadd(y,z,p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = _mm_mul_ps(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmadd(y2, x, x);
-
-  /* select the correct result from the two polynoms */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y  = _mm_andnot_ps(poly_mask, y);
-  y  = _mm_or_ps(y,y2);
-
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
+  return pcos_float(_x);
 }
 
 #if EIGEN_FAST_MATH
@@ -455,17 +86,17 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
-  Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
-  Packet4f denormal_mask = _mm_and_ps(
-      _mm_cmpge_ps(_x, _mm_setzero_ps()),
-      _mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
+  Packet4f minus_half_x = pmul(_x, pset1<Packet4f>(-0.5f));
+  Packet4f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
 
   // Compute approximate reciprocal sqrt.
   Packet4f x = _mm_rsqrt_ps(_x);
   // Do a single step of Newton's iteration.
-  x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet4f>(1.5f)));
   // Flush results for denormals to zero.
-  return _mm_andnot_ps(denormal_mask, pmul(_x,x));
+  return pandnot(pmul(_x,x), denormal_mask);
 }
 
 #else
@@ -478,41 +109,48 @@ Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
 
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
+
 #if EIGEN_FAST_MATH
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
   _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
 
   Packet4f neg_half = pmul(_x, p4f_minus_half);
 
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
-  Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
-
-  // Fill in NaNs and Infs for the negative/zero entries.
-  Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
-  Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
-  Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
-                                     _mm_and_ps(zero_mask, p4f_inf));
-
-  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
-
-  // Insert NaNs and Infs in all the right places.
-  return _mm_or_ps(x, infs_and_nans);
+  // Identity infinite, zero, negative and denormal arguments.
+  Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min);
+  Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf);
+  Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask);
+
+  // Compute an approximate result using the rsqrt intrinsic.
+  Packet4f y_approx = _mm_rsqrt_ps(_x);
+
+  // Do a single step of Newton-Raphson iteration to improve the approximation.
+  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+  // It is essential to evaluate the inner term like this because forming
+  // y_n^2 may over- or underflow.
+  Packet4f y_newton = pmul(
+      y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five));
+
+  // Select the result of the Newton-Raphson step for positive normal arguments.
+  // For other arguments, choose the output of the intrinsic. This will
+  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
+  // x is zero or a positive denormalized float (equivalent to flushing positive
+  // denormalized inputs to zero).
+  return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
 }
 
 #else
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x) {
-  // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation.
+  // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation.
   return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
 }
 
@@ -520,7 +158,6 @@ Packet4f prsqrt<Packet4f>(const Packet4f& x) {
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
-  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
   return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
 }
 
@@ -548,7 +185,7 @@ double sqrt(const double &x)
 {
 #if EIGEN_COMP_GNUC_STRICT
   // This works around a GCC bug generating poor code for _mm_sqrt_pd
-  // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b
+  // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
   return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
 #else
   return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
diff --git a/contrib/eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/contrib/eigen/Eigen/src/Core/arch/SSE/PacketMath.h
index 60e2517e4bdf935888d247dc71b30c633d953412..db102c73a2376887332d2b25f6894bfa86284653 100755
--- a/contrib/eigen/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/contrib/eigen/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -18,13 +18,15 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+// 32 bits =>  8 registers
+// 64 bits => 16 registers
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif
 
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 #endif
 
@@ -34,47 +36,75 @@ namespace internal {
 // One solution is to increase ABI version using -fabi-version=4 (or greater).
 // Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
 // structure:
-template<typename T>
-struct eigen_packet_wrapper
-{
-  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
-  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
-  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
-  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
-  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
-    m_val = v;
-    return *this;
-  }
-  
-  T m_val;
-};
 typedef eigen_packet_wrapper<__m128>  Packet4f;
-typedef eigen_packet_wrapper<__m128i> Packet4i;
 typedef eigen_packet_wrapper<__m128d> Packet2d;
 #else
 typedef __m128  Packet4f;
-typedef __m128i Packet4i;
 typedef __m128d Packet2d;
 #endif
 
+typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
+typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
+
 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
 template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet4i>  { enum { value = true }; };
+template<> struct is_arithmetic<Packet16b>  { enum { value = true }; };
+
+template<int p, int q, int r, int s>
+struct shuffle_mask{
+ enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
+};
 
+// TODO: change the implementation of all swizzle* ops from macro to template,
 #define vec4f_swizzle1(v,p,q,r,s) \
-  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
+  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
 
 #define vec4i_swizzle1(v,p,q,r,s) \
-  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
+  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
 
 #define vec2d_swizzle1(v,p,q) \
-  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
-  
+  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
+
 #define vec4f_swizzle2(a,b,p,q,r,s) \
-  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
+  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
 
 #define vec4i_swizzle2(a,b,p,q,r,s) \
-  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
+
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movelh_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movehl_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpacklo_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpackhi_ps(a,b));
+}
+#define vec4f_duplane(a,p) \
+  vec4f_swizzle2(a,a,p,p,p,p)
+
+#define vec2d_swizzle2(a,b,mask) \
+  Packet2d(_mm_shuffle_pd(a,b,mask))
+
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpacklo_pd(a,b));
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpackhi_pd(a,b));
+}
+#define vec2d_duplane(a,p) \
+  vec2d_swizzle2(a,a,(p<<1)|p)
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@@ -83,7 +113,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
   const Packet2d p2d_##NAME = pset1<Packet2d>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
+  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
@@ -92,36 +122,41 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
 // to leverage AVX instructions.
 #ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<float>  : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
   typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
+    size = 4,
     HasHalfPacket = 0,
 
-    HasDiv  = 1,
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = EIGEN_FAST_MATH,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasCmp  = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasNdtri = 1,
+    HasExp = 1,
+    HasBessel = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasTanh  = EIGEN_FAST_MATH,
-    HasBlend = 1
-
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 1,
+    HasCeil = 1,
+    HasFloor = 1,
 #ifdef EIGEN_VECTORIZE_SSE4_1
-    ,
     HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1
 #endif
+    HasRint = 1
   };
 };
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef Packet2d type;
   typedef Packet2d half;
   enum {
@@ -130,18 +165,19 @@ template<> struct packet_traits<double> : default_packet_traits
     size=2,
     HasHalfPacket = 0,
 
+    HasCmp  = 1,
     HasDiv  = 1,
+    HasLog  = 1,
     HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasBlend = 1
-
+    HasBlend = 1,
+    HasFloor = 1,
+    HasCeil = 1,
 #ifdef EIGEN_VECTORIZE_SSE4_1
-    ,
     HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1
 #endif
+    HasRint = 1
   };
 };
 #endif
@@ -154,13 +190,56 @@ template<> struct packet_traits<int>    : default_packet_traits
     AlignedOnScalar = 1,
     size=4,
 
+    HasShift = 1,
     HasBlend = 1
   };
 };
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct packet_traits<bool> : default_packet_traits
+{
+  typedef Packet16b type;
+  typedef Packet16b half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasHalfPacket = 0,
+    size=16,
+
+    HasAdd       = 1,
+    HasSub       = 1,
+    HasShift     = 0,
+    HasMul       = 1,
+    HasNegate    = 1,
+    HasAbs       = 0,
+    HasAbs2      = 0,
+    HasMin       = 0,
+    HasMax       = 0,
+    HasConj      = 0,
+    HasSqrt      = 1
+  };
+};
+
+template<> struct unpacket_traits<Packet4f> {
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet4i  integer_packet;
+  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet2d> {
+  typedef double    type;
+  typedef Packet2d  half;
+  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet4i> {
+  typedef int       type;
+  typedef Packet4i  half;
+  enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
+};
+template<> struct unpacket_traits<Packet16b> {
+  typedef bool       type;
+  typedef Packet16b  half;
+  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
 
 #ifndef EIGEN_VECTORIZE_AVX
 template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
@@ -179,6 +258,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
 #endif
+template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool&    from) { return _mm_set1_epi8(static_cast<char>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }
+template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
+template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
 
 // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
 // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
@@ -190,7 +281,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
   return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
 }
 #endif
-  
+
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
@@ -199,9 +290,34 @@ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_addsub_ps(a,b);
+#else
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
+  return padd(a, pxor(mask, b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) 
+{
+#ifdef EIGEN_VECTORIZE_SSE3  
+  return _mm_addsub_pd(a,b); 
+#else
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); 
+  return padd(a, pxor(mask, b));
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
 {
@@ -218,6 +334,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
   return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
+{
+  return psub(pset1<Packet16b>(false), a);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
@@ -240,18 +361,126 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
 
 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return _mm_blendv_ps(b,a,mask);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {  return _mm_blendv_pd(b,a,mask); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  return _mm_blendv_epi8(b,a,mask);
+}
+#else
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  Packet16b a_part = _mm_and_si128(mask, a);
+  Packet16b b_part = _mm_andnot_si128(mask, b);
+  return _mm_or_si128(a_part, b_part);
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+ptrue<Packet4f>(const Packet4f& a) {
+  Packet4i b = _mm_castps_si128(a);
+  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+ptrue<Packet2d>(const Packet2d& a) {
+  Packet4i b = _mm_castpd_si128(a);
+  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet4f res;
+  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet4f res = b;
+  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_min_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet2d res;
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet2d res = b;
+  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_pd(b, a);
+#endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -263,8 +492,45 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet4f res;
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet4f res = b;
+  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_ps(b, a);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+  // There appears to be a bug in GCC, by which the optimizer may
+  // flip the argument order in calls to _mm_max_pd, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  #ifdef EIGEN_VECTORIZE_AVX
+  Packet2d res;
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  #else
+  Packet2d res = b;
+  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  #endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_pd(b, a);
+#endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -276,36 +542,180 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
 #endif
 }
 
+template <typename Packet, typename Op>
+EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
+  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
+  // always return a if either a or b is NaN.
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet m = op(a, b);
+  return pselect<Packet>(not_nan_mask_a, m, b);
+}
+
+template <typename Packet, typename Op>
+EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
+  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
+  // always return a if either a or b is NaN.
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet m = op(b, a);
+  return pselect<Packet>(not_nan_mask_a, m, a);
+}
+
+// Add specializations for min/max with prescribed NaN progation.
+template<>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
+}
+template<>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right   (const Packet4i& a) { return _mm_srli_epi32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left    (const Packet4i& a) { return _mm_slli_epi32(a,N); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm_and_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
+{
+  #ifdef EIGEN_VECTORIZE_SSSE3
+  return _mm_abs_epi32(a);
+  #else
+  Packet4i aux = _mm_srai_epi32(a,31);
+  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
+  #endif
+}
+
 #ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round.
+  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
+  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
+  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
+  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
+  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
-#endif
+#else
+template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
+  // Adds and subtracts signum(a) * 2^52 to force rounding.
+  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
+  const Packet2d abs_a = pabs(a);
+  Packet2d r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If greater, subtract one.
+  Packet4f mask = _mm_cmpgt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  Packet2d tmp  = print<Packet2d>(a);
+  // If greater, subtract one.
+  Packet2d mask = _mm_cmpgt_pd(tmp, a);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If smaller, add one.
+  Packet4f mask = _mm_cmplt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  Packet2d tmp  = print<Packet2d>(a);
+  // If smaller, add one.
+  Packet2d mask = _mm_cmplt_pd(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+#endif
 
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool*     from) { EIGEN_DEBUG_ALIGNED_LOAD return  _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
 
 #if EIGEN_COMP_MSVC
   template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
@@ -340,6 +750,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
   EIGEN_DEBUG_UNALIGNED_LOAD
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
+template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool*     from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
 
 
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
@@ -355,13 +769,32 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
 }
 
+// Loads 8 bools from memory and returns the packet
+// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
+template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool*     from)
+{
+  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
+  return  _mm_unpacklo_epi8(tmp, tmp);
+}
+
+// Loads 4 bools from memory and returns the packet
+// {b0, b0  b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
+template<> EIGEN_STRONG_INLINE Packet16b
+ploadquad<Packet16b>(const bool* from) {
+  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
+  tmp = _mm_unpacklo_epi8(tmp, tmp);
+  return  _mm_unpacklo_epi16(tmp, tmp);
+}
+
 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
 
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
 
 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
@@ -374,7 +807,15 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const dou
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
- }
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
+{
+  return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
+                      from[11*stride], from[10*stride], from[9*stride], from[8*stride],
+                      from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+                      from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
 
 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
@@ -395,6 +836,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
   to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
   to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
+template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
+{
+  to[4*stride*0] = _mm_cvtsi128_si32(from);
+  to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+
 
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
@@ -409,7 +858,7 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
   pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }
 
-#if EIGEN_COMP_PGI
+#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
 typedef const void * SsePrefetchPtrType;
 #else
 typedef const char * SsePrefetchPtrType;
@@ -437,32 +886,62 @@ template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { retu
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
 #endif
+template<> EIGEN_STRONG_INLINE bool   pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{ return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{ return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
+#ifdef EIGEN_VECTORIZE_SSSE3
+  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm_shuffle_epi8(a, mask);
+#else
+  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
+  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
-{
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
-  return _mm_and_ps(a,mask);
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a,exponent);
 }
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
-{
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
-  return _mm_and_pd(a,mask);
+
+// Extract exponent without existence of Packet2l.
+template<>
+EIGEN_STRONG_INLINE  
+Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  const Packet2d cst_exp_mask  = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
+  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
 }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
-{
-  #ifdef EIGEN_VECTORIZE_SSSE3
-  return _mm_abs_epi32(a);
-  #else
-  Packet4i aux = _mm_srai_epi32(a,31);
-  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
-  #endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a,exponent);
+}
+
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
+  
+  // Convert e to integer and swizzle to low-order bits.
+  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+  
+  // Split 2^e into four factors and multiply:
+  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+  Packet4i b = parithmetic_shift_right<2>(ei);  // floor(e/4)
+  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^b
+  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+  b = psub(psub(psub(ei, b), b), b);  // e - 3b
+  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^(e - 3b)
+  out = pmul(out, c);  // a * 2^e
+  return out;
 }
 
 // with AVX, the default implementations based on pload1 are faster
@@ -505,38 +984,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
 }
 
-#ifdef EIGEN_VECTORIZE_SSE3
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  return _mm_hadd_pd(vecs[0], vecs[1]);
-}
-
-#else
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  Packet4f tmp0, tmp1, tmp2;
-  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
-  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
-  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
-  tmp0 = _mm_add_ps(tmp0, tmp1);
-  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
-  tmp1 = _mm_add_ps(tmp1, tmp2);
-  tmp2 = _mm_movehl_ps(tmp1, tmp0);
-  tmp0 = _mm_movelh_ps(tmp0, tmp1);
-  return _mm_add_ps(tmp0, tmp2);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
-}
-#endif  // SSE3
-
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
@@ -562,38 +1009,28 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 }
 
 #ifdef EIGEN_VECTORIZE_SSSE3
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
-}
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp0 = _mm_hadd_epi32(a,a);
   return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
 }
+
 #else
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
   return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }
+#endif
 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  Packet4i tmp0, tmp1, tmp2;
-  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
-  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
-  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-  tmp0 = _mm_add_epi32(tmp0, tmp1);
-  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
-  tmp1 = _mm_add_epi32(tmp1, tmp2);
-  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
-  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
-  return _mm_add_epi32(tmp0, tmp2);
+template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
 }
-#endif
+
 // Other reduction functions:
 
+
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
@@ -611,7 +1048,13 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
   // TODO try to call _mm_mul_epu32 directly
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
-  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);
+}
+
+template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
+  return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
+          (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
 }
 
 // min
@@ -666,113 +1109,16 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 #endif // EIGEN_VECTORIZE_SSE4_1
 }
 
-#if EIGEN_COMP_GNUC
-// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
-// {
-//   Packet4f res = b;
-//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
-//   return res;
-// }
-// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
 // {
-//   Packet4i res = a;
-//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
-//   return res;
+//   return _mm_movemask_ps(x) == 0xF;
 // }
-#endif
-
-#ifdef EIGEN_VECTORIZE_SSSE3
-// SSSE3 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-    if (Offset!=0)
-      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
-  }
-};
 
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
 {
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-    if (Offset!=0)
-      first = _mm_alignr_epi8(second,first, Offset*4);
-  }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset==1)
-      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
-  }
-};
-#else
-// SSE2 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_move_ss(first,second);
-      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
-    }
-    else if (Offset==2)
-    {
-      first = _mm_movehl_ps(first,first);
-      first = _mm_movelh_ps(first,second);
-    }
-    else if (Offset==3)
-    {
-      first = _mm_move_ss(first,second);
-      first = _mm_shuffle_ps(first,second,0x93);
-    }
-  }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-      first = _mm_shuffle_epi32(first,0x39);
-    }
-    else if (Offset==2)
-    {
-      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
-      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-    }
-    else if (Offset==3)
-    {
-      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
-    }
-  }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
-      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
-    }
-  }
-};
-#endif
+  return _mm_movemask_ps(x) != 0x0;
+}
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4f,4>& kernel) {
@@ -799,6 +1145,100 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
 }
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,4>& kernel) {
+  __m128i T0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
+  __m128i T3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
+  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
+  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
+  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,16>& kernel) {
+  // If we number the elements in the input thus:
+  // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
+  // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
+  // ...
+  // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
+  //
+  // the desired output is:
+  // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
+  // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
+  // ...
+  // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
+  __m128i t0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  __m128i t1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+  __m128i t2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ...                     27 37
+  __m128i t3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ...                     2f 3f
+  __m128i t4 =  _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52                         47 57
+  __m128i t5 =  _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
+  __m128i t6 =  _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t7 =  _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t8 =  _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i t9 =  _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i ta =  _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tb =  _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tc =  _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i td =  _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i te =  _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
+  __m128i tf =  _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
+
+  __m128i s0 =  _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  __m128i s1 =  _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
+  __m128i s2 =  _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
+  __m128i s3 =  _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
+  __m128i s4 =  _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  __m128i s5 =  _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
+  __m128i s6 =  _mm_unpacklo_epi16(t5, t7);
+  __m128i s7 =  _mm_unpackhi_epi16(t5, t7);
+  __m128i s8 =  _mm_unpacklo_epi16(t8, ta);
+  __m128i s9 =  _mm_unpackhi_epi16(t8, ta);
+  __m128i sa =  _mm_unpacklo_epi16(t9, tb);
+  __m128i sb =  _mm_unpackhi_epi16(t9, tb);
+  __m128i sc =  _mm_unpacklo_epi16(tc, te);
+  __m128i sd =  _mm_unpackhi_epi16(tc, te);
+  __m128i se =  _mm_unpacklo_epi16(td, tf);
+  __m128i sf =  _mm_unpackhi_epi16(td, tf);
+
+  __m128i u0 =  _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  __m128i u1 =  _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  __m128i u2 =  _mm_unpacklo_epi32(s1, s5);
+  __m128i u3 =  _mm_unpackhi_epi32(s1, s5);
+  __m128i u4 =  _mm_unpacklo_epi32(s2, s6);
+  __m128i u5 =  _mm_unpackhi_epi32(s2, s6);
+  __m128i u6 =  _mm_unpacklo_epi32(s3, s7);
+  __m128i u7 =  _mm_unpackhi_epi32(s3, s7);
+  __m128i u8 =  _mm_unpacklo_epi32(s8, sc);
+  __m128i u9 =  _mm_unpackhi_epi32(s8, sc);
+  __m128i ua =  _mm_unpacklo_epi32(s9, sd);
+  __m128i ub =  _mm_unpackhi_epi32(s9, sd);
+  __m128i uc =  _mm_unpacklo_epi32(sa, se);
+  __m128i ud =  _mm_unpackhi_epi32(sa, se);
+  __m128i ue =  _mm_unpacklo_epi32(sb, sf);
+  __m128i uf =  _mm_unpackhi_epi32(sb, sf);
+
+  kernel.packet[0]  = _mm_unpacklo_epi64(u0, u8);
+  kernel.packet[1]  = _mm_unpackhi_epi64(u0, u8);
+  kernel.packet[2]  = _mm_unpacklo_epi64(u1, u9);
+  kernel.packet[3]  = _mm_unpackhi_epi64(u1, u9);
+  kernel.packet[4]  = _mm_unpacklo_epi64(u2, ua);
+  kernel.packet[5]  = _mm_unpackhi_epi64(u2, ua);
+  kernel.packet[6]  = _mm_unpacklo_epi64(u3, ub);
+  kernel.packet[7]  = _mm_unpackhi_epi64(u3, ub);
+  kernel.packet[8]  = _mm_unpacklo_epi64(u4, uc);
+  kernel.packet[9]  = _mm_unpackhi_epi64(u4, uc);
+  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
+  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
+  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
+  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
+  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
+  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
@@ -830,59 +1270,229 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_blend_ps(a,pset1<Packet4f>(b),1);
-#else
-  return _mm_move_ss(a, _mm_load_ss(&b));
+// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+  return ::fmaf(a,b,c);
+}
+template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+  return ::fma(a,b,c);
+}
 #endif
+
+
+// Packet math for Eigen::half
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#if 0
+
+typedef struct {
+  __m64 x;
+} Packet4h;
+
+
+template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet4h type;
+  // There is no half-size packet for Packet4h.
+  typedef Packet4h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
+
+template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
+  Packet4h result;
+  result.x = _mm_set1_pi16(from.x);
+  return result;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_blend_pd(a,pset1<Packet2d>(b),1);
-#else
-  return _mm_move_sd(a, _mm_load_sd(&b));
-#endif
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
-#else
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
-  return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
-#endif
+template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha + hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha - hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha * hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha / hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h
+ploadquad<Packet4h>(const Eigen::half* from) {
+  return pset1<Packet4h>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
 {
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
-#else
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
-  return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
-#endif
+  Packet4h result;
+  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
 }
 
-// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
-#ifdef __FMA__
-template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
-  return ::fmaf(a,b,c);
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
+{
+  __int64_t a = _mm_cvtm64_si64(from.x);
+  to[stride*0].x = static_cast<unsigned short>(a);
+  to[stride*1].x = static_cast<unsigned short>(a >> 16);
+  to[stride*2].x = static_cast<unsigned short>(a >> 32);
+  to[stride*3].x = static_cast<unsigned short>(a >> 48);
 }
-template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
-  return ::fma(a,b,c);
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h,4>& kernel) {
+  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
+  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
+
+  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
+  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
+  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
+  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
 }
+
 #endif
 
+
 } // end namespace internal
 
 } // end namespace Eigen
 
-#if EIGEN_COMP_PGI
+#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
 // PGI++ does not define the following intrinsics in C++ mode.
 static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
 static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
diff --git a/contrib/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h b/contrib/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h
index c6ca8c716c04d19bb18f351768df4a6bd232c132..d2a0037e0147cb3d8cdd5b934f79728a62108c32 100644
--- a/contrib/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/contrib/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -69,6 +69,71 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
   return _mm_cvtps_pd(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+  return _mm_castps_si128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_pd(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet2d>(const Packet2d& a) {
+  return _mm_castpd_si128(a);
+}
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#if 0
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
+  float f1 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  float f2 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  float f3 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  float f4 = static_cast<float>(h);
+  return _mm_set_ps(f4, f3, f2, f1);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
+  EIGEN_ALIGN16 float aux[4];
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+
+  Packet4h result;
+  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
+  return result;
+}
+
+#endif
 
 } // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/ZVector/Complex.h b/contrib/eigen/Eigen/src/Core/arch/ZVector/Complex.h
index 1bfb73397d6c7f24fe102b1811928ea6746a8e5c..0b9b33d99dd9c03296521e33eb2fb7986ff003ce 100644
--- a/contrib/eigen/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/contrib/eigen/Eigen/src/Core/arch/ZVector/Complex.h
@@ -15,6 +15,10 @@ namespace Eigen {
 
 namespace internal {
 
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static Packet4ui  p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+#endif
+
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 
@@ -29,10 +33,14 @@ struct Packet2cf
 {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
   union {
     Packet4f v;
     Packet1cd cd[2];
   };
+#else
+  Packet4f v;
+#endif
 };
 
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
@@ -83,69 +91,33 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
 
 /* Forward declaration */
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
 
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+/* complex<double> first */
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
-  Packet2cf res;
-  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
-  res.cd[1] = res.cd[0];
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
-  std::complex<float> EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet2cf>(af);
-}
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
 {
   return pload<Packet1cd>(from);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
-  std::complex<float> EIGEN_ALIGN16 af[2];
-  pstore<std::complex<float> >((std::complex<float> *) af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-}
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
 {
   pstore<std::complex<double> >(to, from);
 }
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
-  Packet2cf res;
-  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
-  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
-  return res;
-}
-
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   Packet2d a_re, a_im, v1, v2;
@@ -163,27 +135,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
 
   return Packet1cd(v1 + v2);
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet2cf res;
-  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
-  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
-  return res;
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
-
+template<> EIGEN_STRONG_INLINE Packet1cd pand    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por     <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d eq = vec_cmpeq (a.v, b.v);
+  Packet2d tmp = { eq[1], eq[0] };
+  return (Packet1cd)pand<Packet2d>(eq, tmp);
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
@@ -193,160 +155,157 @@ template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Pac
 
   return res;
 }
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
-{
-  std::complex<float> EIGEN_ALIGN16 res[2];
-  pstore<std::complex<float> >(res, a);
-
-  return res[0];
-}
 
 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
 {
-  Packet2cf res;
-  res.cd[0] = a.cd[1];
-  res.cd[1] = a.cd[0];
-  return res;
+  return pfirst(a);
 }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
 {
   return pfirst(a);
 }
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  std::complex<float> res;
-  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
-  vec_st2f(b.v, (float*)&res);
-  return res;
+  // TODO optimize it for AltiVec
+  Packet1cd res = pmul(a,pconj(b));
+  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
+  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
 {
-  return vecs[0];
+  return Packet1cd(preverse(Packet2d(x.v)));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
 {
-  PacketBlock<Packet2cf,2> transpose;
-  transpose.packet[0] = vecs[0];
-  transpose.packet[1] = vecs[1];
-  ptranspose(transpose);
+  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
 
-  return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]);
-} 
+/* complex<float> follows */
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  return pfirst(a);
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  pstore<std::complex<float> >(res, a);
+
+  return res[0];
 }
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
-  std::complex<float> res;
-  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
-  vec_st2f(b.v, (float*)&res);
+  Packet2cf res;
+  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+  res.cd[1] = res.cd[0];
   return res;
 }
-
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
-  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
-  {
-    // FIXME is it sure we never have to align a Packet1cd?
-    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
-  }
-};
+  Packet2cf res;
+  if((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float *)&from);
+  else
+    res.v = ploadu<Packet4f>((const float *)&from);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+  return res;
+}
+#endif
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset == 1) {
-      first.cd[0] = first.cd[1];
-      first.cd[1] = second.cd[0];
-    }
-  }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
 
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
 
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
 
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
 
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
 
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = pcmp_eq<Packet4f> (a.v, b.v);
+  Packet2cf res;
+  Packet2d tmp1 = { eq.v4f[0][1], eq.v4f[0][0] };
+  Packet2d tmp2 = { eq.v4f[1][1], eq.v4f[1][0] };
+  res.v.v4f[0] = pand<Packet2d>(eq.v4f[0], tmp1);
+  res.v.v4f[1] = pand<Packet2d>(eq.v4f[1], tmp2);
+  return res;
+}
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+  Packet2cf res;
+  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
+}
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.cd[0] = a.cd[1];
+  res.cd[1] = a.cd[0];
+  return res;
+}
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
-  // TODO optimize it for AltiVec
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
-  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
-  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
+  std::complex<float> res;
+  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
 }
 
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for AltiVec
@@ -356,11 +315,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
   return res;
 }
 
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
-  return Packet1cd(preverse(Packet2d(x.v)));
-}
-
 EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
 {
   Packet2cf res;
@@ -369,13 +323,6 @@ EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
   return res;
 }
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
-  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
-  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
-  kernel.packet[0].v = tmp;
-}
-
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
 {
   Packet1cd tmp = kernel.packet[0].cd[1];
@@ -389,6 +336,88 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con
   result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
   return result;
 }
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = vec_cmpeq (a.v, b.v);
+  Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] };
+  return (Packet2cf)pand<Packet4f>(eq, tmp);
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f a_re, a_im, prod, prod_im;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+  
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+
+  // multiply a_im * b and get the conjugate result
+  prod_im = a_im * b.v;
+  prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR));
+  // permute back to a proper order
+  prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
+
+  // multiply a_re * b, add prod_im
+  prod = pmadd<Packet4f>(a_re, b.v, prod_im);
+ 
+  return Packet2cf(prod);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet4f rev_a;
+  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
+  return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  Packet2cf prod;
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
+
+  return pfirst<Packet2cf>(prod);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = pmul(a, pconj(b));
+  Packet4f s = pmul<Packet4f>(b.v, b.v);
+  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
+{
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
 
 } // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h b/contrib/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 5c7aa7256782a78834632280634eb4d4cb5e1945..1635e128c816ee27d5f4a15a31e74d1c7411a53f 100644
--- a/contrib/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/contrib/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -20,6 +20,50 @@ namespace Eigen {
 
 namespace internal {
 
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+/* natural logarithm computed for 4 simultaneous float
+  return NaN for x <= 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+#endif
+
 static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
 static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
 static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
@@ -93,43 +137,95 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& x)
+Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  Packet4f x = _x;
+
+  Packet4f tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+  // express exp(x) as exp(g + n*log(2))
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+  fx = pfloor(fx);
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = (Packet4i){ (int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3] };
+  emm0 = emm0 + p4i_0x7f;
+  emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
+
+  return pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x);
+#else
   Packet4f res;
-  res.v4f[0] = pexp<Packet2d>(x.v4f[0]);
-  res.v4f[1] = pexp<Packet2d>(x.v4f[1]);
+  res.v4f[0] = pexp<Packet2d>(_x.v4f[0]);
+  res.v4f[1] = pexp<Packet2d>(_x.v4f[1]);
   return res;
+#endif
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d psqrt<Packet2d>(const Packet2d& x)
 {
-  return  __builtin_s390_vfsqdb(x);
+  return vec_sqrt(x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& x)
 {
   Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  res = vec_sqrt(x);
+#else
   res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
   res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+#endif
   return res;
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
-  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
   return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x) {
   Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
+#else
   res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
   res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+#endif
   return res;
 }
 
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h b/contrib/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h
index 57b01fc634d638f3d8b360adce028b1e9c81aad6..1f55a90a5cd83da1a5c2e90e9da4ff3ff6246757 100755
--- a/contrib/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/contrib/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -10,26 +10,20 @@
 #ifndef EIGEN_PACKET_MATH_ZVECTOR_H
 #define EIGEN_PACKET_MATH_ZVECTOR_H
 
-#include <stdint.h>
-
 namespace Eigen {
 
 namespace internal {
 
 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
 #endif
 
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  16
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
 #endif
 
 typedef __vector int                 Packet4i;
@@ -41,21 +35,30 @@ typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;
 
+// Z14 has builtin support for float vectors
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+typedef __vector float               Packet4f;
+#else
 typedef struct {
 	Packet2d  v4f[2];
 } Packet4f;
+#endif
 
 typedef union {
-  int32_t   i[4];
-  uint32_t ui[4];
-  int64_t   l[2];
-  uint64_t ul[2];
+  numext::int32_t   i[4];
+  numext::uint32_t ui[4];
+  numext::int64_t   l[2];
+  numext::uint64_t ul[2];
   double    d[2];
+  float     f[4];
   Packet4i  v4i;
   Packet4ui v4ui;
   Packet2l  v2l;
   Packet2ul v2ul;
   Packet2d  v2d;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  Packet4f  v4f;
+#endif
 } Packet;
 
 // We don't want to write the same code all the time, but we need to reuse the constants
@@ -80,15 +83,31 @@ typedef union {
   Packet2l p2l_##NAME = pset1<Packet2l>(X)
 
 // These constants are endian-agnostic
-//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
 
 static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
 static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
 static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
 
-static Packet2d p2d_ONE = { 1.0, 1.0 }; 
-static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
+static Packet2d p2d_ONE = { 1.0, 1.0 };
+static Packet2d p2d_ZERO_ = { numext::bit_cast<double>0x8000000000000000ull),
+                              numext::bit_cast<double>0x8000000000000000ull) };
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
+#endif
 
 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
@@ -120,9 +139,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0
 static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
 static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
 
-//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
 
-//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 
 
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -149,29 +168,31 @@ template<> struct packet_traits<int>    : default_packet_traits
   };
 };
 
-template<> struct packet_traits<float> : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
   typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
+    size = 4,
     HasHalfPacket = 0,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 1,
-    HasMin  = 1,
-    HasMax  = 1,
-    HasAbs  = 1,
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasTanh = 1,
+    HasErf = 1,
     HasRound = 1,
     HasFloor = 1,
     HasCeil = 1,
@@ -211,9 +232,9 @@ template<> struct packet_traits<double> : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
 
 /* Forward declaration */
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
@@ -258,82 +279,15 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
   return s;
 }
 
-/* Helper function to simulate a vec_splat_packet4f
- */
-template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
 {
-  Packet4f splat;
-  switch (element) {
-  case 0:
-    splat.v4f[0] = vec_splat(from.v4f[0], 0);
-    splat.v4f[1] = splat.v4f[0];
-    break;
-  case 1:
-    splat.v4f[0] = vec_splat(from.v4f[0], 1);
-    splat.v4f[1] = splat.v4f[0];
-    break;
-  case 2:
-    splat.v4f[0] = vec_splat(from.v4f[1], 0);
-    splat.v4f[1] = splat.v4f[0];
-    break;
-  case 3:
-    splat.v4f[0] = vec_splat(from.v4f[1], 1);
-    splat.v4f[1] = splat.v4f[0];
-    break;
-  }
-  return splat;
+  Packet vt;
+  vt.v4f = v;
+  s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
+  return s;
 }
-
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-    switch (Offset % 4) {
-    case 1:
-      first = vec_sld(first, second, 4); break;
-    case 2:
-      first = vec_sld(first, second, 8); break;
-    case 3:
-      first = vec_sld(first, second, 12); break;
-    }
-  }
-};
-
-/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
- */
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-    switch (Offset % 4) {
-    case 1:
-      first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
-      first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
-      break;
-    case 2:
-      first.v4f[0] = first.v4f[1];
-      first.v4f[1] = second.v4f[0];
-      break;
-    case 3:
-      first.v4f[0] = vec_sld(first.v4f[1],  second.v4f[0], 8);
-      first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
-      break;
-    }
-  }
-};
-
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset == 1)
-      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));
-  }
-};
+#endif
 
 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
 {
@@ -344,16 +298,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
   return vfrom->v4i;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
-{
-  // FIXME: No intrinsic yet
-  EIGEN_DEBUG_ALIGNED_LOAD
-  Packet4f vfrom;
-  vfrom.v4f[0] = vec_ld2f(&from[0]);
-  vfrom.v4f[1] = vec_ld2f(&from[2]);
-  return vfrom;
-}
-
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
 {
   // FIXME: No intrinsic yet
@@ -372,15 +316,6 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& f
   vto->v4i = from;
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
-{
-  // FIXME: No intrinsic yet
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_st2f(from.v4f[0], &to[0]);
-  vec_st2f(from.v4f[1], &to[2]);
-}
-
-
 template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
   // FIXME: No intrinsic yet
@@ -397,13 +332,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
   return vec_splats(from);
 }
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
-{
-  Packet4f to;
-  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
-  to.v4f[1] = to.v4f[0];
-  return to;
-}
 
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4i>(const int *a,
@@ -416,17 +344,6 @@ pbroadcast4<Packet4i>(const int *a,
   a3 = vec_splat(a3, 3);
 }
 
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
-  a3 = pload<Packet4f>(a);
-  a0 = vec_splat_packet4f<0>(a3);
-  a1 = vec_splat_packet4f<1>(a3);
-  a2 = vec_splat_packet4f<2>(a3);
-  a3 = vec_splat_packet4f<3>(a3);
-}
-
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet2d>(const double *a,
                       Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
@@ -449,16 +366,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
  return pload<Packet4i>(ai);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
-  float EIGEN_ALIGN16 ai[4];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
- return pload<Packet4f>(ai);
-}
-
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
   double EIGEN_ALIGN16 af[2];
@@ -477,16 +384,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
   to[3*stride] = ai[3];
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
-  float EIGEN_ALIGN16 ai[4];
-  pstore<float>((float *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
-}
-
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
   double EIGEN_ALIGN16 af[2];
@@ -496,160 +393,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
 }
 
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f c;
-  c.v4f[0] = a.v4f[0] + b.v4f[0];
-  c.v4f[1] = a.v4f[1] + b.v4f[1];
-  return c;
-}
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
 
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f c;
-  c.v4f[0] = a.v4f[0] - b.v4f[0];
-  c.v4f[1] = a.v4f[1] - b.v4f[1];
-  return c;
-}
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
 
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f c;
-  c.v4f[0] = a.v4f[0] * b.v4f[0];
-  c.v4f[1] = a.v4f[1] * b.v4f[1];
-  return c;
-}
 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
 
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f c;
-  c.v4f[0] = a.v4f[0] / b.v4f[0];
-  c.v4f[1] = a.v4f[1] / b.v4f[1];
-  return c;
-}
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
 
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
-  Packet4f c;
-  c.v4f[0] = -a.v4f[0];
-  c.v4f[1] = -a.v4f[1];
-  return c;
-}
 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
 
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
 
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{
-  Packet4f res;
-  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
-  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
-  return res;
-}
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
 
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
 
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f res;
-  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
-  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
-  return res;
-}
 
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f res;
-  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
-  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
-  return res;
-}
 
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f res;
-  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
-  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
-  return res;
-}
 
 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f res;
-  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
-  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
-  return res;
-}
 
 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f res;
-  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
-  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
-  return res;
-}
 
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f res;
-  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
-  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
-  return res;
-}
 
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
-  Packet4f res;
-  res.v4f[0] = vec_round(a.v4f[0]);
-  res.v4f[1] = vec_round(a.v4f[1]);
-  return res;
-}
 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
-{
-  Packet4f res;
-  res.v4f[0] = vec_ceil(a.v4f[0]);
-  res.v4f[1] = vec_ceil(a.v4f[1]);
-  return res;
-}
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
-  Packet4f res;
-  res.v4f[0] = vec_floor(a.v4f[0]);
-  res.v4f[1] = vec_floor(a.v4f[1]);
-  return res;
-}
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
 
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*     from) { return pload<Packet4f>(from); }
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }
 
 
@@ -659,14 +448,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
   return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
-{
-  Packet4f p = pload<Packet4f>(from);
-  p.v4f[1] = vec_splat(p.v4f[0], 1);
-  p.v4f[0] = vec_splat(p.v4f[0], 0);
-  return p;
-}
-
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 {
   Packet2d p = pload<Packet2d>(from);
@@ -674,15 +455,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 }
 
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*    to, const Packet4f& from) { pstore<float>(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }
 
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
 
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
@@ -695,23 +473,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
   return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
-  Packet4f rev;
-  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
-  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
-  return rev;
-}
-
 template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
-{
-  Packet4f res;
-  res.v4f[0] = pabs(a.v4f[0]);
-  res.v4f[1] = pabs(a.v4f[1]);
-  return res;
-}
 
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
@@ -730,71 +493,10 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
   sum = padd<Packet2d>(a, b);
   return pfirst(sum);
 }
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet2d sum;
-  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
-  double first = predux<Packet2d>(sum);
-  return static_cast<float>(first);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  Packet4i v[4], sum[4];
-
-  // It's easier and faster to transpose then add as columns
-  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
-  // Do the transpose, first set of moves
-  v[0] = vec_mergeh(vecs[0], vecs[2]);
-  v[1] = vec_mergel(vecs[0], vecs[2]);
-  v[2] = vec_mergeh(vecs[1], vecs[3]);
-  v[3] = vec_mergel(vecs[1], vecs[3]);
-  // Get the resulting vectors
-  sum[0] = vec_mergeh(v[0], v[2]);
-  sum[1] = vec_mergel(v[0], v[2]);
-  sum[2] = vec_mergeh(v[1], v[3]);
-  sum[3] = vec_mergel(v[1], v[3]);
-
-  // Now do the summation:
-  // Lines 0+1
-  sum[0] = padd<Packet4i>(sum[0], sum[1]);
-  // Lines 2+3
-  sum[1] = padd<Packet4i>(sum[2], sum[3]);
-  // Add the results
-  sum[0] = padd<Packet4i>(sum[0], sum[1]);
-
-  return sum[0];
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  Packet2d v[2], sum;
-  v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
-  v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));
- 
-  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));
-
-  return sum;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  PacketBlock<Packet4f,4> transpose;
-  transpose.packet[0] = vecs[0];
-  transpose.packet[1] = vecs[1];
-  transpose.packet[2] = vecs[2];
-  transpose.packet[3] = vecs[3];
-  ptranspose(transpose);
-
-  Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
-  sum = padd(sum, transpose.packet[2]);
-  sum = padd(sum, transpose.packet[3]);
-  return sum;
-}
-
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 {
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
@@ -806,12 +508,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
   return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }
 
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
-  // Return predux_mul<Packet2d> of the subvectors product
-  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
-}
-
 // min
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
@@ -826,14 +522,6 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
   return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }
 
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
-  Packet2d b, res;
-  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
-  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
-  return static_cast<float>(pfirst(res));
-}
-
 // max
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
@@ -849,14 +537,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
   return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }
 
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  Packet2d b, res;
-  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
-  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
-  return static_cast<float>(pfirst(res));
-}
-
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4i,4>& kernel) {
   Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
@@ -877,6 +557,282 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
   kernel.packet[1] = t1;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+/* z13 has no vector float support so we emulate that with double
+   z14 has proper vector float support.
+*/
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+/* Helper function to simulate a vec_splat_packet4f
+ */
+template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
+{
+  Packet4f splat;
+  switch (element) {
+  case 0:
+    splat.v4f[0] = vec_splat(from.v4f[0], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 1:
+    splat.v4f[0] = vec_splat(from.v4f[0], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 2:
+    splat.v4f[0] = vec_splat(from.v4f[1], 0);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  case 3:
+    splat.v4f[0] = vec_splat(from.v4f[1], 1);
+    splat.v4f[1] = splat.v4f[0];
+    break;
+  }
+  return splat;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet4f vfrom;
+  vfrom.v4f[0] = vec_ld2f(&from[0]);
+  vfrom.v4f[1] = vec_ld2f(&from[2]);
+  return vfrom;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_st2f(from.v4f[0], &to[0]);
+  vec_st2f(from.v4f[1], &to[2]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
+{
+  Packet4f to;
+  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+  to.v4f[1] = to.v4f[0];
+  return to;
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat_packet4f<0>(a3);
+  a1 = vec_splat_packet4f<1>(a3);
+  a2 = vec_splat_packet4f<2>(a3);
+  a3 = vec_splat_packet4f<3>(a3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4f>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  pstore<float>((float *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] + b.v4f[0];
+  c.v4f[1] = a.v4f[1] + b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+  double first = predux<Packet2d>(sum);
+  return static_cast<float>(first);
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  // Return predux_mul<Packet2d> of the subvectors product
+  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
 /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
  */
 EIGEN_DEVICE_FUNC inline void
@@ -915,12 +871,6 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
   kernel.packet[3].v4f[1] = t3.packet[1];
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
-  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
   Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
   Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
@@ -932,12 +882,177 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
   return result;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
-  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;
+  return vfrom->v4f;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v4f = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
+{
+  return vec_splats(from);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  af[2] = from[2*stride];
+  af[3] = from[3*stride];
+ return pload<Packet4f>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  pstore<float>((float*)af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+  to[2*stride] = af[2];
+  to[3*stride] = af[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>  (const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>  (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>    (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>  (const Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>   (const Packet4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = padd<Packet4f>(a, b);
+  b   = vec_sld(sum, sum, 4);
+  sum = padd<Packet4f>(sum, b);
+  return pfirst(sum);
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f prod;
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, res;
+  b   = pmin<Packet4f>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4f>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f b, res;
+  b = pmax<Packet4f>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4f>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
+#endif
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>  (const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/functors/AssignmentFunctors.h b/contrib/eigen/Eigen/src/Core/functors/AssignmentFunctors.h
index 4153b877cffd64077d7d521d49d921b9af8d9ac2..bf64ef4ede6d605ad95412ccb71654ce00f6cc21 100644
--- a/contrib/eigen/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/contrib/eigen/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
   EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
   {
-#ifdef __CUDACC__
+#ifdef EIGEN_GPUCC
     // FIXME is there some kind of cuda::swap?
     Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
 #else
@@ -157,7 +157,16 @@ template<typename Scalar>
 struct functor_traits<swap_assign_op<Scalar> > {
   enum {
     Cost = 3 * NumTraits<Scalar>::ReadCost,
-    PacketAccess = packet_traits<Scalar>::Vectorizable
+    PacketAccess = 
+    #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__))
+    // This is a partial workaround for a bug in clang generating bad code
+    // when mixing 256/512 bits loads and 128 bits moves.
+    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684
+    //     https://bugs.llvm.org/show_bug.cgi?id=40815
+    0
+    #else
+    packet_traits<Scalar>::Vectorizable
+    #endif
   };
 };
 
diff --git a/contrib/eigen/Eigen/src/Core/functors/BinaryFunctors.h b/contrib/eigen/Eigen/src/Core/functors/BinaryFunctors.h
index 3eae6b8cada75488767699a698efa7dba383cbe9..63f09ab9317e31d26525916d8ca0a031fa7c1247 100644
--- a/contrib/eigen/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/contrib/eigen/Eigen/src/Core/functors/BinaryFunctors.h
@@ -39,32 +39,26 @@ struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
     EIGEN_SCALAR_BINARY_OP_PLUGIN
   }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::padd(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
   { return internal::predux(a); }
 };
 template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2, // rough estimate!
+    Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2, // rough estimate!
     PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
     // TODO vectorize mixed sum
   };
 };
 
-/** \internal
-  * \brief Template specialization to deprecate the summation of boolean expressions.
-  * This is required to solve Bug 426.
-  * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
-  */
-template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {
-  EIGEN_DEPRECATED
-  scalar_sum_op() {}
-};
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a || b; }
 
 
 /** \internal
@@ -83,23 +77,27 @@ struct scalar_product_op  : binary_op_base<LhsScalar,RhsScalar>
     EIGEN_SCALAR_BINARY_OP_PLUGIN
   }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmul(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
   { return internal::predux_mul(a); }
 };
 template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
+    Cost = (int(NumTraits<LhsScalar>::MulCost) + int(NumTraits<RhsScalar>::MulCost))/2, // rough estimate!
     PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
     // TODO vectorize mixed product
   };
 };
 
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a && b; }
+
+
 /** \internal
   * \brief Template functor to compute the conjugate product of two scalars
   *
@@ -116,11 +114,11 @@ struct scalar_conj_product_op  : binary_op_base<LhsScalar,RhsScalar>
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
   
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const
   { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
   
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
 };
 template<typename LhsScalar,typename RhsScalar>
@@ -136,21 +134,28 @@ struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
   *
   * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
   */
-template<typename LhsScalar,typename RhsScalar>
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
 struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const {
+    return internal::pmin<NaNPropagation>(a, b);
+  }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmin(a,b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return internal::pmin<NaNPropagation>(a,b);
+  }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
-  { return internal::predux_min(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  {
+    return internal::predux_min<NaNPropagation>(a);
+  }
 };
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_min_op<LhsScalar,RhsScalar> > {
+
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct functor_traits<scalar_min_op<LhsScalar,RhsScalar, NaNPropagation> > {
   enum {
     Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
     PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
@@ -162,21 +167,28 @@ struct functor_traits<scalar_min_op<LhsScalar,RhsScalar> > {
   *
   * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
   */
-template<typename LhsScalar,typename RhsScalar>
-struct scalar_max_op  : binary_op_base<LhsScalar,RhsScalar>
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct scalar_max_op : binary_op_base<LhsScalar,RhsScalar>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const {
+    return internal::pmax<NaNPropagation>(a,b);
+  }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmax(a,b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return internal::pmax<NaNPropagation>(a,b);
+  }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
-  { return internal::predux_max(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  {
+    return internal::predux_max<NaNPropagation>(a);
+  }
 };
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_max_op<LhsScalar,RhsScalar> > {
+
+template<typename LhsScalar,typename RhsScalar, int NaNPropagation>
+struct functor_traits<scalar_max_op<LhsScalar,RhsScalar, NaNPropagation> > {
   enum {
     Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
     PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
@@ -253,7 +265,6 @@ struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,Rh
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;}
 };
 
-
 /** \internal
   * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
   *
@@ -287,6 +298,7 @@ struct functor_traits<scalar_hypot_op<Scalar,Scalar> > {
 
 /** \internal
   * \brief Template functor to compute the pow of two scalars
+  * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow
   */
 template<typename Scalar, typename Exponent>
 struct scalar_pow_op  : binary_op_base<Scalar,Exponent>
@@ -301,16 +313,31 @@ struct scalar_pow_op  : binary_op_base<Scalar,Exponent>
     EIGEN_SCALAR_BINARY_OP_PLUGIN
   }
 #endif
+
   EIGEN_DEVICE_FUNC
   inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  {
+    return generic_pow(a,b);
+  }
 };
+
 template<typename Scalar, typename Exponent>
 struct functor_traits<scalar_pow_op<Scalar,Exponent> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger &&
+                    packet_traits<Scalar>::HasExp && packet_traits<Scalar>::HasLog &&
+                    packet_traits<Scalar>::HasRound && packet_traits<Scalar>::HasCmp &&
+                    // Temporarly disable packet access for half/bfloat16 until
+                    // accuracy is improved.
+                    !is_same<Scalar, half>::value && !is_same<Scalar, bfloat16>::value
+                    )
+  };
 };
 
-
-
 //---------- non associative binary functors ----------
 
 /** \internal
@@ -337,7 +364,7 @@ struct scalar_difference_op : binary_op_base<LhsScalar,RhsScalar>
 template<typename LhsScalar,typename RhsScalar>
 struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
   enum {
-    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2,
     PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
   };
 };
@@ -382,11 +409,14 @@ struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
 struct scalar_boolean_and_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pand(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_and_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
+    PacketAccess = true
   };
 };
 
@@ -398,11 +428,14 @@ template<> struct functor_traits<scalar_boolean_and_op> {
 struct scalar_boolean_or_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::por(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_or_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
+    PacketAccess = true
   };
 };
 
@@ -414,11 +447,44 @@ template<> struct functor_traits<scalar_boolean_or_op> {
 struct scalar_boolean_xor_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pxor(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_xor_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
+    PacketAccess = true
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the absolute difference of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::absolute_difference
+  */
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_absolute_difference_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_absolute_difference_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op)
+#else
+  scalar_absolute_difference_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  { return numext::absdiff(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pabsdiff(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_absolute_difference_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
+    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAbsDiff
   };
 };
 
@@ -436,7 +502,7 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
   typedef typename BinaryOp::second_argument_type second_argument_type;
   typedef typename BinaryOp::result_type          result_type;
 
-  bind1st_op(const first_argument_type &val) : m_value(val) {}
+  EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
 
@@ -455,7 +521,7 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
   typedef typename BinaryOp::second_argument_type second_argument_type;
   typedef typename BinaryOp::result_type          result_type;
 
-  bind2nd_op(const second_argument_type &val) : m_value(val) {}
+  EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
 
diff --git a/contrib/eigen/Eigen/src/Core/functors/NullaryFunctors.h b/contrib/eigen/Eigen/src/Core/functors/NullaryFunctors.h
index b03be0269c9ba3ff2fabf733d2309edbe7155b08..192f225ddafda2e867cf80adb396fac940eeeec0 100644
--- a/contrib/eigen/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/contrib/eigen/Eigen/src/Core/functors/NullaryFunctors.h
@@ -37,26 +37,27 @@ template<typename Scalar>
 struct functor_traits<scalar_identity_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
 
-template <typename Scalar, typename Packet, bool IsInteger> struct linspaced_op_impl;
+template <typename Scalar, bool IsInteger> struct linspaced_op_impl;
 
-template <typename Scalar, typename Packet>
-struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,/*IsInteger*/false>
 {
-  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
-    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : Scalar((high-low)/RealScalar(num_steps-1))),
     m_flip(numext::abs(high)<numext::abs(low))
   {}
 
   template<typename IndexType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
     if(m_flip)
-      return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
+      return (i==0)? m_low : Scalar(m_high - RealScalar(m_size1-i)*m_step);
     else
-      return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
+      return (i==m_size1)? m_high : Scalar(m_low + RealScalar(i)*m_step);
   }
 
-  template<typename IndexType>
+  template<typename Packet, typename IndexType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
   {
     // Principle:
@@ -65,17 +66,17 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
     {
       Packet pi = plset<Packet>(Scalar(i-m_size1));
       Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
-      if(i==0)
-        res = pinsertfirst(res, m_low);
-      return res;
+      if (EIGEN_PREDICT_TRUE(i != 0)) return res;
+      Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
+      return pselect<Packet>(mask, res, pset1<Packet>(m_low));
     }
     else
     {
       Packet pi = plset<Packet>(Scalar(i));
       Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
-      if(i==m_size1-unpacket_traits<Packet>::size+1)
-        res = pinsertlast(res, m_high);
-      return res;
+      if(EIGEN_PREDICT_TRUE(i != m_size1-unpacket_traits<Packet>::size+1)) return res;
+      Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size-1));
+      return pselect<Packet>(mask, res, pset1<Packet>(m_high));
     }
   }
 
@@ -86,10 +87,10 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
   const bool m_flip;
 };
 
-template <typename Scalar, typename Packet>
-struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,/*IsInteger*/true>
 {
-  linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
     m_low(low),
     m_multiplier((high-low)/convert_index<Scalar>(num_steps<=1 ? 1 : num_steps-1)),
     m_divisor(convert_index<Scalar>((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
@@ -115,8 +116,8 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
 // Forward declaration (we default to random access which does not really give
 // us a speed gain when using packet access but it allows to use the functor in
 // nested expressions).
-template <typename Scalar, typename PacketType> struct linspaced_op;
-template <typename Scalar, typename PacketType> struct functor_traits< linspaced_op<Scalar,PacketType> >
+template <typename Scalar> struct linspaced_op;
+template <typename Scalar> struct functor_traits< linspaced_op<Scalar> >
 {
   enum
   {
@@ -126,9 +127,9 @@ template <typename Scalar, typename PacketType> struct functor_traits< linspaced
     IsRepeatable = true
   };
 };
-template <typename Scalar, typename PacketType> struct linspaced_op
+template <typename Scalar> struct linspaced_op
 {
-  linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
+  EIGEN_DEVICE_FUNC linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
     : impl((num_steps==1 ? high : low),high,num_steps)
   {}
 
@@ -136,11 +137,11 @@ template <typename Scalar, typename PacketType> struct linspaced_op
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }
 
   template<typename Packet,typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp<Packet>(i); }
 
   // This proxy object handles the actual required temporaries and the different
   // implementations (integer vs. floating point).
-  const linspaced_op_impl<Scalar,PacketType,NumTraits<Scalar>::IsInteger> impl;
+  const linspaced_op_impl<Scalar,NumTraits<Scalar>::IsInteger> impl;
 };
 
 // Linear access is automatically determined from the operator() prototypes available for the given functor.
@@ -166,12 +167,12 @@ struct has_unary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value =
 template<typename Scalar,typename IndexType>
 struct has_binary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 1}; };
 
-template<typename Scalar, typename PacketType,typename IndexType>
-struct has_nullary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
-template<typename Scalar, typename PacketType,typename IndexType>
-struct has_unary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 1}; };
-template<typename Scalar, typename PacketType,typename IndexType>
-struct has_binary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
 
 template<typename Scalar,typename IndexType>
 struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
diff --git a/contrib/eigen/Eigen/src/Core/functors/StlFunctors.h b/contrib/eigen/Eigen/src/Core/functors/StlFunctors.h
index 9c1d75850be409bdacab63c64a41d13a3e949a13..4570c9b634bd920753aa4d9ab063c4ab49ab4ecd 100644
--- a/contrib/eigen/Eigen/src/Core/functors/StlFunctors.h
+++ b/contrib/eigen/Eigen/src/Core/functors/StlFunctors.h
@@ -12,6 +12,28 @@
 
 namespace Eigen {
 
+// Portable replacements for certain functors.
+namespace numext {
+
+template<typename T = void>
+struct equal_to {
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template<typename T = void>
+struct not_equal_to {
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const {
+    return lhs != rhs;
+  }
+};
+
+}
+
+
 namespace internal {
 
 // default functor traits for STL functors:
@@ -68,11 +90,19 @@ template<typename T>
 struct functor_traits<std::equal_to<T> >
 { enum { Cost = 1, PacketAccess = false }; };
 
+template<typename T>
+struct functor_traits<numext::equal_to<T> >
+  : functor_traits<std::equal_to<T> > {};
+
 template<typename T>
 struct functor_traits<std::not_equal_to<T> >
 { enum { Cost = 1, PacketAccess = false }; };
 
-#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900)
+template<typename T>
+struct functor_traits<numext::not_equal_to<T> >
+  : functor_traits<std::not_equal_to<T> > {};
+
+#if (EIGEN_COMP_CXXVER < 11)
 // std::binder* are deprecated since c++11 and will be removed in c++17
 template<typename T>
 struct functor_traits<std::binder2nd<T> >
@@ -83,7 +113,7 @@ struct functor_traits<std::binder1st<T> >
 { enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
 #endif
 
-#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910)
+#if (EIGEN_COMP_CXXVER < 17)
 // std::unary_negate is deprecated since c++17 and will be removed in c++20
 template<typename T>
 struct functor_traits<std::unary_negate<T> >
diff --git a/contrib/eigen/Eigen/src/Core/functors/UnaryFunctors.h b/contrib/eigen/Eigen/src/Core/functors/UnaryFunctors.h
index b56e7afd2c5ebcb08720724ee9e870fc60bcf4b1..16136d185aa3c16cd7b7cff1fa97ca07c405ddf0 100644
--- a/contrib/eigen/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/contrib/eigen/Eigen/src/Core/functors/UnaryFunctors.h
@@ -109,7 +109,7 @@ struct functor_traits<scalar_abs2_op<Scalar> >
 template<typename Scalar> struct scalar_conjugate_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
 };
@@ -117,7 +117,15 @@ template<typename Scalar>
 struct functor_traits<scalar_conjugate_op<Scalar> >
 {
   enum {
-    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
+    Cost = 0,
+    // Yes the cost is zero even for complexes because in most cases for which
+    // the cost is used, conjugation turns to be a no-op. Some examples:
+    //   cost(a*conj(b)) == cost(a*b)
+    //   cost(a+conj(b)) == cost(a+b)
+    //   <etc.
+    // If we don't set it to zero, then:
+    //   A.conjugate().lazyProduct(B.conjugate())
+    // will bake its operands. We definitely don't want that!
     PacketAccess = packet_traits<Scalar>::HasConj
   };
 };
@@ -130,7 +138,7 @@ struct functor_traits<scalar_conjugate_op<Scalar> >
 template<typename Scalar> struct scalar_arg_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); }
   template<typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
   { return internal::parg(a); }
@@ -158,6 +166,44 @@ template<typename Scalar, typename NewType>
 struct functor_traits<scalar_cast_op<Scalar,NewType> >
 { enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
 
+/** \internal
+  * \brief Template functor to arithmetically shift a scalar right by a number of bits
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::shift_right()
+  */
+template<typename Scalar, int N>
+struct scalar_shift_right_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
+  { return a >> N; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::parithmetic_shift_right<N>(a); }
+};
+template<typename Scalar, int N>
+struct functor_traits<scalar_shift_right_op<Scalar,N> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift }; };
+
+/** \internal
+  * \brief Template functor to logically shift a scalar left by a number of bits
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::shift_left()
+  */
+template<typename Scalar, int N>
+struct scalar_shift_left_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
+  { return a << N; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::plogical_shift_left<N>(a); }
+};
+template<typename Scalar, int N>
+struct functor_traits<scalar_shift_left_op<Scalar,N> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift }; };
+
 /** \internal
   * \brief Template functor to extract the real part of a complex
   *
@@ -262,6 +308,26 @@ struct functor_traits<scalar_exp_op<Scalar> > {
   };
 };
 
+/** \internal
+  *
+  * \brief Template functor to compute the exponential of a scalar - 1.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::expm1()
+  */
+template<typename Scalar> struct scalar_expm1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_expm1_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExpm1,
+    Cost = functor_traits<scalar_exp_op<Scalar> >::Cost // TODO measure cost of expm1
+  };
+};
+
 /** \internal
   *
   * \brief Template functor to compute the logarithm of a scalar
@@ -321,7 +387,7 @@ struct functor_traits<scalar_log1p_op<Scalar> > {
   */
 template<typename Scalar> struct scalar_log10_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD_MATH(log10) return log10(a); }
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); }
 };
@@ -329,6 +395,22 @@ template<typename Scalar>
 struct functor_traits<scalar_log10_op<Scalar> >
 { enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog10 }; };
 
+/** \internal
+  *
+  * \brief Template functor to compute the base-2 logarithm of a scalar
+  *
+  * \sa class CwiseUnaryOp, Cwise::log2()
+  */
+template<typename Scalar> struct scalar_log2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * numext::log(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_log2_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
+
 /** \internal
   * \brief Template functor to compute the square root of a scalar
   * \sa class CwiseUnaryOp, Cwise::sqrt()
@@ -356,13 +438,25 @@ struct functor_traits<scalar_sqrt_op<Scalar> > {
   };
 };
 
+// Boolean specialization to eliminate -Wimplicit-conversion-floating-point-to-bool warnings.
+template<> struct scalar_sqrt_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template <typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return a; }
+};
+template <>
+struct functor_traits<scalar_sqrt_op<bool> > {
+  enum { Cost = 1, PacketAccess = packet_traits<bool>::Vectorizable };
+};
+
 /** \internal
   * \brief Template functor to compute the reciprocal square root of a scalar
   * \sa class CwiseUnaryOp, Cwise::rsqrt()
   */
 template<typename Scalar> struct scalar_rsqrt_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); }
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::rsqrt(a); }
   template <typename Packet>
   EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); }
 };
@@ -528,6 +622,23 @@ struct functor_traits<scalar_tanh_op<Scalar> > {
   };
 };
 
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the atanh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atanh()
+  */
+template <typename Scalar>
+struct scalar_atanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_atanh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
 /** \internal
   * \brief Template functor to compute the sinh of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::sinh()
@@ -547,6 +658,23 @@ struct functor_traits<scalar_sinh_op<Scalar> >
   };
 };
 
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the asinh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::asinh()
+  */
+template <typename Scalar>
+struct scalar_asinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_asinh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
 /** \internal
   * \brief Template functor to compute the cosh of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::cosh()
@@ -566,6 +694,23 @@ struct functor_traits<scalar_cosh_op<Scalar> >
   };
 };
 
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the acosh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::acosh()
+  */
+template <typename Scalar>
+struct scalar_acosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_acosh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
 /** \internal
   * \brief Template functor to compute the inverse of a scalar
   * \sa class CwiseUnaryOp, Cwise::inverse()
@@ -578,9 +723,13 @@ struct scalar_inverse_op {
   EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
   { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+template <typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasDiv,
+    Cost = scalar_div_cost<Scalar, PacketAccess>::value
+  };
+};
 
 /** \internal
   * \brief Template functor to compute the square of a scalar
@@ -598,6 +747,19 @@ template<typename Scalar>
 struct functor_traits<scalar_square_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
 
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<>
+struct scalar_square_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template<typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return a; }
+};
+template<>
+struct functor_traits<scalar_square_op<bool> >
+{ enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable }; };
+
 /** \internal
   * \brief Template functor to compute the cube of a scalar
   * \sa class CwiseUnaryOp, Cwise::cube()
@@ -614,6 +776,19 @@ template<typename Scalar>
 struct functor_traits<scalar_cube_op<Scalar> >
 { enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
 
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<>
+struct scalar_cube_op<bool> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; }
+  template<typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return a; }
+};
+template<>
+struct functor_traits<scalar_cube_op<bool> >
+{ enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable }; };
+
 /** \internal
   * \brief Template functor to compute the rounded value of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::round()
@@ -652,6 +827,25 @@ struct functor_traits<scalar_floor_op<Scalar> >
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the rounded (with current rounding mode)  value of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::rint()
+  */
+template<typename Scalar> struct scalar_rint_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_rint_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRint
+  };
+};
+
 /** \internal
   * \brief Template functor to compute the ceil of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::ceil()
@@ -678,7 +872,13 @@ struct functor_traits<scalar_ceil_op<Scalar> >
 template<typename Scalar> struct scalar_isnan_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isnan(a);
+#else
+    return (numext::isnan)(a);
+#endif
+  }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isnan_op<Scalar> >
@@ -696,7 +896,13 @@ struct functor_traits<scalar_isnan_op<Scalar> >
 template<typename Scalar> struct scalar_isinf_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isinf(a);
+#else
+    return (numext::isinf)(a);
+#endif
+  }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isinf_op<Scalar> >
@@ -714,7 +920,13 @@ struct functor_traits<scalar_isinf_op<Scalar> >
 template<typename Scalar> struct scalar_isfinite_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isfinite(a);
+#else
+    return (numext::isfinite)(a);
+#endif
+  }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isfinite_op<Scalar> >
@@ -746,9 +958,9 @@ struct functor_traits<scalar_boolean_not_op<Scalar> > {
   * \brief Template functor to compute the signum of a scalar
   * \sa class CwiseUnaryOp, Cwise::sign()
   */
-template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
+template<typename Scalar,bool is_complex=(NumTraits<Scalar>::IsComplex!=0), bool is_integer=(NumTraits<Scalar>::IsInteger!=0) > struct scalar_sign_op;
 template<typename Scalar>
-struct scalar_sign_op<Scalar,false> {
+struct scalar_sign_op<Scalar, false, true> {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
   {
@@ -758,8 +970,21 @@ struct scalar_sign_op<Scalar,false> {
   //template <typename Packet>
   //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
 };
+
 template<typename Scalar>
-struct scalar_sign_op<Scalar,true> {
+struct scalar_sign_op<Scalar, false, false> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    return (numext::isnan)(a) ? a : Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+
+template<typename Scalar, bool is_integer>
+struct scalar_sign_op<Scalar,true, is_integer> {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
   EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
   {
@@ -777,7 +1002,7 @@ struct scalar_sign_op<Scalar,true> {
 template<typename Scalar>
 struct functor_traits<scalar_sign_op<Scalar> >
 { enum {
-    Cost = 
+    Cost =
         NumTraits<Scalar>::IsComplex
         ? ( 8*NumTraits<Scalar>::MulCost  ) // roughly
         : ( 3*NumTraits<Scalar>::AddCost),
@@ -785,6 +1010,120 @@ struct functor_traits<scalar_sign_op<Scalar> >
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the logistic function of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::logistic()
+  */
+template <typename T>
+struct scalar_logistic_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    return packetOp(x);
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& x) const {
+    const Packet one = pset1<Packet>(T(1));
+    return pdiv(one, padd(one, pexp(pnegate(x))));
+  }
+};
+
+#ifndef EIGEN_GPU_COMPILE_PHASE
+/** \internal
+  * \brief Template specialization of the logistic function for float.
+  *
+  *  Uses just a 9/10-degree rational interpolant which
+  *  interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range
+  *  [-9, 18]. Below -9 we use the more accurate approximation
+  *  1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing
+  *  one ulp. The shifted logistic is interpolated because it was easier to
+  *  make the fit converge.
+  *
+  */
+template <>
+struct scalar_logistic_op<float> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const {
+    return packetOp(x);
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& _x) const {
+    const Packet cutoff_lower = pset1<Packet>(-9.f);
+    const Packet lt_mask = pcmp_lt<Packet>(_x, cutoff_lower);
+    const bool any_small = predux_any(lt_mask);
+
+    // The upper cut-off is the smallest x for which the rational approximation evaluates to 1.
+    // Choosing this value saves us a few instructions clamping the results at the end.
+#ifdef EIGEN_VECTORIZE_FMA
+    const Packet cutoff_upper = pset1<Packet>(15.7243833541870117f);
+#else
+    const Packet cutoff_upper = pset1<Packet>(15.6437711715698242f);
+#endif
+    const Packet x = pmin(_x, cutoff_upper);
+
+    // The monomial coefficients of the numerator polynomial (odd).
+    const Packet alpha_1 = pset1<Packet>(2.48287947061529e-01f);
+    const Packet alpha_3 = pset1<Packet>(8.51377133304701e-03f);
+    const Packet alpha_5 = pset1<Packet>(6.08574864600143e-05f);
+    const Packet alpha_7 = pset1<Packet>(1.15627324459942e-07f);
+    const Packet alpha_9 = pset1<Packet>(4.37031012579801e-11f);
+
+    // The monomial coefficients of the denominator polynomial (even).
+    const Packet beta_0 = pset1<Packet>(9.93151921023180e-01f);
+    const Packet beta_2 = pset1<Packet>(1.16817656904453e-01f);
+    const Packet beta_4 = pset1<Packet>(1.70198817374094e-03f);
+    const Packet beta_6 = pset1<Packet>(6.29106785017040e-06f);
+    const Packet beta_8 = pset1<Packet>(5.76102136993427e-09f);
+    const Packet beta_10 = pset1<Packet>(6.10247389755681e-13f);
+
+    // Since the polynomials are odd/even, we need x^2.
+    const Packet x2 = pmul(x, x);
+
+    // Evaluate the numerator polynomial p.
+    Packet p = pmadd(x2, alpha_9, alpha_7);
+    p = pmadd(x2, p, alpha_5);
+    p = pmadd(x2, p, alpha_3);
+    p = pmadd(x2, p, alpha_1);
+    p = pmul(x, p);
+
+    // Evaluate the denominator polynomial q.
+    Packet q = pmadd(x2, beta_10, beta_8);
+    q = pmadd(x2, q, beta_6);
+    q = pmadd(x2, q, beta_4);
+    q = pmadd(x2, q, beta_2);
+    q = pmadd(x2, q, beta_0);
+    // Divide the numerator by the denominator and shift it up.
+    const Packet logistic = padd(pdiv(p, q), pset1<Packet>(0.5f));
+    if (EIGEN_PREDICT_FALSE(any_small)) {
+      const Packet exponential = pexp(_x);
+      return pselect(lt_mask, exponential, logistic);
+    } else {
+      return logistic;
+    }
+  }
+};
+#endif  // #ifndef EIGEN_GPU_COMPILE_PHASE
+
+template <typename T>
+struct functor_traits<scalar_logistic_op<T> > {
+  enum {
+    // The cost estimate for float here here is for the common(?) case where
+    // all arguments are greater than -9.
+    Cost = scalar_div_cost<T, packet_traits<T>::HasDiv>::value +
+           (internal::is_same<T, float>::value
+                ? NumTraits<T>::AddCost * 15 + NumTraits<T>::MulCost * 11
+                : NumTraits<T>::AddCost * 2 +
+                      functor_traits<scalar_exp_op<T> >::Cost),
+    PacketAccess =
+        packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+        (internal::is_same<T, float>::value
+             ? packet_traits<T>::HasMul && packet_traits<T>::HasMax &&
+                   packet_traits<T>::HasMin
+             : packet_traits<T>::HasNegate && packet_traits<T>::HasExp)
+  };
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/contrib/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 681451cc309fc7237ab188ab51b80b60f8ec2ab9..f35b760c1df804cd66bfe43c6493df33a90f84f6 100644
--- a/contrib/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/contrib/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -15,7 +15,13 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+enum GEBPPacketSizeType {
+  GEBPPacketFull = 0,
+  GEBPPacketHalf,
+  GEBPPacketQuarter
+};
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
 class gebp_traits;
 
 
@@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
   return a<=0 ? b : a;
 }
 
+#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
+#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+  
 #if EIGEN_ARCH_i386_OR_x86_64
-const std::ptrdiff_t defaultL1CacheSize = 32*1024;
-const std::ptrdiff_t defaultL2CacheSize = 256*1024;
-const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
+#elif EIGEN_ARCH_PPC
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
 #else
-const std::ptrdiff_t defaultL1CacheSize = 16*1024;
-const std::ptrdiff_t defaultL2CacheSize = 512*1024;
-const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
 #endif
 
+#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
+
 /** \internal */
 struct CacheSizes {
   CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
@@ -50,7 +82,6 @@ struct CacheSizes {
   std::ptrdiff_t m_l3;
 };
 
-
 /** \internal */
 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
@@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
   // at the register level. This small horizontal panel has to stay within L1 cache.
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  #ifdef EIGEN_VECTORIZE_AVX512
+  // We need to find a rationale for that, but without this adjustment,
+  // performance with AVX512 is pretty bad, like -20% slower.
+  // One reason is that with increasing packet-size, the blocking size k
+  // has to become pretty small if we want that 1 lhs panel fit within L1.
+  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
+  //   k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
+  // This is quite small for a good reuse of the accumulation registers.
+  l1 *= 4;
+  #endif
 
   if (num_threads > 1) {
     typedef typename Traits::ResScalar ResScalar;
@@ -308,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
   computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
 }
 
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
-#else
-
-  // FIXME (a bit overkill maybe ?)
-
-  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
-    {
-      c = cj.pmadd(a,b,c);
-    }
-  };
-
-  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
-    {
-      t = b; t = cj.pmul(a,t); c = padd(c,t);
-    }
-  };
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
+ public:
+  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
+};
 
-  template<typename CJ, typename A, typename B, typename C, typename T>
-  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
-  {
-    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
-  }
+template <typename Packet>
+struct QuadPacket
+{
+  Packet B_0, B1, B2, B3;
+  const Packet& get(const FixedInt<0>&) const { return B_0; }
+  const Packet& get(const FixedInt<1>&) const { return B1; }
+  const Packet& get(const FixedInt<2>&) const { return B2; }
+  const Packet& get(const FixedInt<3>&) const { return B3; }
+};
 
-  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
-//   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
-#endif
+template <int N, typename T1, typename T2, typename T3>
+struct packet_conditional { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)         \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<name ## Scalar>::type, \
+                                      typename packet_traits<name ## Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  prefix ## name ## Packet
+
+#define PACKET_DECL_COND(name, packet_size)                        \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<name ## Scalar>::type, \
+                                      typename packet_traits<name ## Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  name ## Packet
+
+#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)        \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<Scalar>::type, \
+                                      typename packet_traits<Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+  prefix ## ScalarPacket
+
+#define PACKET_DECL_COND_SCALAR(packet_size)                       \
+  typedef typename packet_conditional<packet_size,                 \
+                                      typename packet_traits<Scalar>::type, \
+                                      typename packet_traits<Scalar>::half, \
+                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+  ScalarPacket
 
 /* Vectorization logic
  *  real*real: unpack rhs to constant packets, ...
@@ -348,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
  *  cplx*real : unpack rhs to constant packets, ...
  *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
  */
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
 class gebp_traits
 {
 public:
@@ -356,13 +422,17 @@ public:
   typedef _RhsScalar RhsScalar;
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
@@ -371,10 +441,12 @@ public:
 
     // register block size along the M direction (currently, this one cannot be modified)
     default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
-#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
-    // we assume 16 registers
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
+    && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
+    // we assume 16 registers or more
     // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
     // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    // Bug 1515: MSVC prior to v19.14 yields to register spilling.
     mr = Vectorizable ? 3*LhsPacketSize : default_mr,
 #else
     mr = default_mr,
@@ -384,37 +456,41 @@ public:
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
 
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
   
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
   {
     p = pset1<ResPacket>(ResScalar(0));
   }
-  
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
-  {
-    pbroadcast4(b, b0, b1, b2, b3);
-  }
-  
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     pbroadcast2(b, b0, b1);
-//   }
-  
+
   template<typename RhsPacketType>
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
     dest = pset1<RhsPacketType>(*b);
   }
-  
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {
+  }
+
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = ploadquad<RhsPacket>(b);
@@ -432,8 +508,8 @@ public:
     dest = ploadu<LhsPacketType>(a);
   }
 
-  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -448,6 +524,12 @@ public:
 #endif
   }
 
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
     r = pmadd(c,alpha,r);
@@ -461,21 +543,25 @@ public:
 
 };
 
-template<typename RealScalar, bool _ConjLhs>
-class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
 {
 public:
   typedef std::complex<RealScalar> LhsScalar;
   typedef RealScalar RhsScalar;
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = false,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     nr = 4,
@@ -490,13 +576,12 @@ public:
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
 
   typedef ResPacket AccPacket;
 
@@ -505,42 +590,64 @@ public:
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
-    dest = pset1<RhsPacket>(*b);
+    dest = pset1<RhsPacketType>(*b);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
   }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
   
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    dest = pset1<RhsPacket>(*b);
+    loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
   }
 
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
   {
-    dest = pload<LhsPacket>(a);
+    // FIXME we can do better!
+    // what we want here is a ploadheight
+    RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
+    dest = ploadquad<RhsPacket>(tmp);
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
   {
-    dest = ploadu<LhsPacket>(a);
+    eigen_internal_assert(RhsPacketSize<=8);
+    dest = pset1<RhsPacket>(*b);
   }
 
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
   {
-    pbroadcast4(b, b0, b1, b2, b3);
+    dest = pload<LhsPacket>(a);
   }
-  
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     pbroadcast2(b, b0, b1);
-//   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  {
+    dest = ploadu<LhsPacketType>(a);
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
   {
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
@@ -555,13 +662,20 @@ public:
     c += a * b;
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
   {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
+  {
+    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
     r = cj.pmadd(c,alpha,r);
   }
 
 protected:
-  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
 };
 
 template<typename Packet>
@@ -580,13 +694,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
   return res;
 }
 
+// note that for DoublePacket<RealPacket> the "4" in "downto4"
+// corresponds to the number of complexes, so it means "8"
+// it terms of real coefficients.
+
 template<typename Packet>
-const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
+const DoublePacket<Packet>&
+predux_half_dowto4(const DoublePacket<Packet> &a,
+                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
 {
   return a;
 }
 
-template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
+template<typename Packet>
+DoublePacket<typename unpacket_traits<Packet>::half>
+predux_half_dowto4(const DoublePacket<Packet> &a,
+                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
+{
+  // yes, that's pretty hackish :(
+  DoublePacket<typename unpacket_traits<Packet>::half> res;
+  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
+  typedef typename packet_traits<Cplx>::type CplxPacket;
+  res.first  = predux_half_dowto4(CplxPacket(a.first)).v;
+  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
+  return res;
+}
+
+// same here, "quad" actually means "8" in terms of real coefficients
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
+{
+  dest.first  = pset1<RealPacket>(numext::real(*b));
+  dest.second = pset1<RealPacket>(numext::imag(*b));
+}
+
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
+{
+  // yes, that's pretty hackish too :(
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
+  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
+  dest.first  = ploadquad<RealPacket>(r);
+  dest.second = ploadquad<RealPacket>(i);
+}
+
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
+  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
+};
 // template<typename Packet>
 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
 // {
@@ -596,8 +754,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
 //   return res;
 // }
 
-template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
 {
 public:
   typedef std::complex<RealScalar>  Scalar;
@@ -605,15 +763,21 @@ public:
   typedef std::complex<RealScalar>  RhsScalar;
   typedef std::complex<RealScalar>  ResScalar;
   
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+  PACKET_DECL_COND(Real, _PacketSize);
+  PACKET_DECL_COND_SCALAR(_PacketSize);
+
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<RealScalar>::Vectorizable
-                && packet_traits<Scalar>::Vectorizable,
-    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
-    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    Vectorizable = unpacket_traits<RealPacket>::vectorizable
+                && unpacket_traits<ScalarPacket>::vectorizable,
+    ResPacketSize   = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
+    RealPacketSize  = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
 
     // FIXME: should depend on NumberOfRegisters
     nr = 4,
@@ -623,14 +787,16 @@ public:
     RhsProgress = 1
   };
   
-  typedef typename packet_traits<RealScalar>::type RealPacket;
-  typedef typename packet_traits<Scalar>::type     ScalarPacket;
-  typedef DoublePacket<RealPacket> DoublePacketType;
+  typedef DoublePacket<RealPacket>                 DoublePacketType;
 
+  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+  // this actualy holds 8 packets!
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   
   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
 
@@ -641,51 +807,49 @@ public:
   }
 
   // Scalar path
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
   {
-    dest = pset1<ResPacket>(*b);
+    dest = pset1<ScalarPacket>(*b);
   }
 
   // Vectorized path
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
   {
-    dest.first  = pset1<RealPacket>(numext::real(*b));
-    dest.second = pset1<RealPacket>(numext::imag(*b));
+    dest.first  = pset1<RealPacketType>(numext::real(*b));
+    dest.second = pset1<RealPacketType>(numext::imag(*b));
   }
-  
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
-    loadRhs(b,dest);
+    loadRhs(b, dest.B_0);
+    loadRhs(b + 1, dest.B1);
+    loadRhs(b + 2, dest.B2);
+    loadRhs(b + 3, dest.B3);
   }
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+
+  // Scalar path
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
   {
-    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
-    loadRhs(b,dest);
+    loadRhs(b, dest);
   }
-  
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+  // Vectorized path
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
-    loadRhs(b+2, b2);
-    loadRhs(b+3, b3);
+    loadRhs(b, dest);
   }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
   
-  // Vectorized path
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
+    loadRhs(b,dest);
   }
-  
-  // Scalar path
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
+    loadQuadToDoublePacket(b,dest);
   }
 
   // nothing special here
@@ -694,47 +858,59 @@ public:
     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
+  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
+  EIGEN_STRONG_INLINE
+  typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
+  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
   {
     c.first   = padd(pmul(a,b.first), c.first);
     c.second  = padd(pmul(a,b.second),c.second);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+  template<typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
   {
     c = cj.pmadd(a,b,c);
   }
+
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
   
   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
   
-  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
+  template<typename RealPacketType, typename ResPacketType>
+  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
   {
     // assemble c
-    ResPacket tmp;
+    ResPacketType tmp;
     if((!ConjLhs)&&(!ConjRhs))
     {
-      tmp = pcplxflip(pconj(ResPacket(c.second)));
-      tmp = padd(ResPacket(c.first),tmp);
+      tmp = pcplxflip(pconj(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first),tmp);
     }
     else if((!ConjLhs)&&(ConjRhs))
     {
-      tmp = pconj(pcplxflip(ResPacket(c.second)));
-      tmp = padd(ResPacket(c.first),tmp);
+      tmp = pconj(pcplxflip(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first),tmp);
     }
     else if((ConjLhs)&&(!ConjRhs))
     {
-      tmp = pcplxflip(ResPacket(c.second));
-      tmp = padd(pconj(ResPacket(c.first)),tmp);
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = padd(pconj(ResPacketType(c.first)),tmp);
     }
     else if((ConjLhs)&&(ConjRhs))
     {
-      tmp = pcplxflip(ResPacket(c.second));
-      tmp = psub(pconj(ResPacket(c.first)),tmp);
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = psub(pconj(ResPacketType(c.first)),tmp);
     }
     
     r = pmadd(tmp,alpha,r);
@@ -744,8 +920,8 @@ protected:
   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
 };
 
-template<typename RealScalar, bool _ConjRhs>
-class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
 {
 public:
   typedef std::complex<RealScalar>  Scalar;
@@ -753,14 +929,25 @@ public:
   typedef Scalar      RhsScalar;
   typedef Scalar      ResScalar;
 
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
+  PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
+
+#undef PACKET_DECL_COND_SCALAR_PREFIX
+#undef PACKET_DECL_COND_PREFIX
+#undef PACKET_DECL_COND_SCALAR
+#undef PACKET_DECL_COND
+
   enum {
     ConjLhs = false,
     ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<RealScalar>::Vectorizable
-                && packet_traits<Scalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    Vectorizable = unpacket_traits<_RealPacket>::vectorizable
+                && unpacket_traits<_ScalarPacket>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     // FIXME: should depend on NumberOfRegisters
@@ -771,14 +958,11 @@ public:
     RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-
+  typedef LhsPacket LhsPacket4Packing;
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
 
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -786,22 +970,25 @@ public:
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
-    dest = pset1<RhsPacket>(*b);
+    dest = pset1<RhsPacketType>(*b);
   }
-  
-  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
-    pbroadcast4(b, b0, b1, b2, b3);
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
   }
-  
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     // FIXME not sure that's the best way to implement it!
-//     b0 = pload1<RhsPacket>(b+0);
-//     b1 = pload1<RhsPacket>(b+1);
-//   }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
 
   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
   {
@@ -810,21 +997,23 @@ public:
   
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
-    loadRhs(b,dest);
+    dest = ploadquad<RhsPacket>(b);
   }
 
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = ploaddup<LhsPacket>(a);
+    dest = ploaddup<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
   {
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
@@ -840,16 +1029,24 @@ public:
     c += a * b;
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
   {
+    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
     r = cj.pmadd(alpha,c,r);
   }
 
 protected:
-  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+
 };
 
-/* optimized GEneral packed Block * packed Panel product kernel
+/* optimized General packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
  *  |  A  |  B  | comments
@@ -859,26 +1056,47 @@ protected:
 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel
 {
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
+  
   typedef typename Traits::ResScalar ResScalar;
   typedef typename Traits::LhsPacket LhsPacket;
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
+  typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
+
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
 
-  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
   typedef typename SwappedTraits::ResScalar SResScalar;
   typedef typename SwappedTraits::LhsPacket SLhsPacket;
   typedef typename SwappedTraits::RhsPacket SRhsPacket;
   typedef typename SwappedTraits::ResPacket SResPacket;
   typedef typename SwappedTraits::AccPacket SAccPacket;
 
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+  typedef typename HalfTraits::AccPacket AccPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+  typedef typename QuarterTraits::AccPacket AccPacketQuarter;
+
   typedef typename DataMapper::LinearMapper LinearMapper;
 
   enum {
     Vectorizable  = Traits::Vectorizable,
     LhsProgress   = Traits::LhsProgress,
+    LhsProgressHalf      = HalfTraits::LhsProgress,
+    LhsProgressQuarter   = QuarterTraits::LhsProgress,
     RhsProgress   = Traits::RhsProgress,
+    RhsProgressHalf      = HalfTraits::RhsProgress,
+    RhsProgressQuarter   = QuarterTraits::RhsProgress,
     ResPacketSize = Traits::ResPacketSize
   };
 
@@ -888,6 +1106,299 @@ struct gebp_kernel
                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };
 
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
+int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
+struct last_row_process_16_packets
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                  ResScalar alpha, SAccPacket &C0)
+    {
+      EIGEN_UNUSED_VARIABLE(res);
+      EIGEN_UNUSED_VARIABLE(straits);
+      EIGEN_UNUSED_VARIABLE(blA);
+      EIGEN_UNUSED_VARIABLE(blB);
+      EIGEN_UNUSED_VARIABLE(depth);
+      EIGEN_UNUSED_VARIABLE(endk);
+      EIGEN_UNUSED_VARIABLE(i);
+      EIGEN_UNUSED_VARIABLE(j2);
+      EIGEN_UNUSED_VARIABLE(alpha);
+      EIGEN_UNUSED_VARIABLE(C0);
+    }
+};
+
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper,  mr,  nr, ConjugateLhs,  ConjugateRhs, 16> {
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
+                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                  ResScalar alpha, SAccPacket &C0)
+  {
+    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
+
+    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
+    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
+
+    if (depth - endk > 0)
+      {
+	// We have to handle the last row(s) of the rhs, which
+	// correspond to a half-packet
+	SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
+
+	for (Index kk = endk; kk < depth; kk++)
+	  {
+	    SLhsPacketQuarter a0;
+	    SRhsPacketQuarter b0;
+	    straits.loadLhsUnaligned(blB, a0);
+	    straits.loadRhs(blA, b0);
+	    straits.madd(a0,b0,c0,b0, fix<0>);
+	    blB += SwappedTraits::LhsProgress/4;
+	    blA += 1;
+	  }
+	straits.acc(c0, alphav, R);
+      }
+    else
+      {
+	straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
+      }
+    res.scatterPacket(i, j2, R);
+  }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_one_packet
+{
+  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  {
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+    traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
+    traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
+    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
+    #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+    __asm__  ("" : "+x,m" (*A0));
+    #endif
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+
+  EIGEN_STRONG_INLINE void operator()(
+    const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
+    Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
+    int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
+  {
+    GEBPTraits traits;
+
+    // loops on each largest micro horizontal panel of lhs
+    // (LhsProgress x depth)
+    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
+    {
+      // loops on each largest micro vertical panel of rhs (depth * nr)
+      for(Index j2=0; j2<packet_cols4; j2+=nr)
+      {
+        // We select a LhsProgress x nr micro block of res
+        // which is entirely stored into 1 x nr registers.
+
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3;
+        traits.initAcc(C0);
+        traits.initAcc(C1);
+        traits.initAcc(C2);
+        traits.initAcc(C3);
+        // To improve instruction pipelining, let's double the accumulation registers:
+        //  even k will accumulate in C*, while odd k will accumulate in D*.
+        // This trick is crutial to get good performance with FMA, otherwise it is 
+        // actually faster to perform separated MUL+ADD because of a naturally
+        // better instruction-level parallelism.
+        AccPacket D0, D1, D2, D3;
+        traits.initAcc(D0);
+        traits.initAcc(D1);
+        traits.initAcc(D2);
+        traits.initAcc(D3);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+        r0.prefetch(prefetch_res_offset);
+        r1.prefetch(prefetch_res_offset);
+        r2.prefetch(prefetch_res_offset);
+        r3.prefetch(prefetch_res_offset);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+        prefetch(&blB[0]);
+        LhsPacket A0, A1;
+
+        for(Index k=0; k<peeled_kc; k+=pk)
+        {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+
+          internal::prefetch(blB+(48+0));
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          internal::prefetch(blB+(48+16));
+          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+
+          blB += pk*4*RhsProgress;
+          blA += pk*LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
+        }
+        C0 = padd(C0,D0);
+        C1 = padd(C1,D1);
+        C2 = padd(C2,D2);
+        C3 = padd(C3,D3);
+
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          blB += 4*RhsProgress;
+          blA += LhsProgress;
+        }
+
+        ResPacket R0, R1;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+
+        R0 = r0.template loadPacket<ResPacket>(0);
+        R1 = r1.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        traits.acc(C1,  alphav, R1);
+        r0.storePacket(0, R0);
+        r1.storePacket(0, R1);
+
+        R0 = r2.template loadPacket<ResPacket>(0);
+        R1 = r3.template loadPacket<ResPacket>(0);
+        traits.acc(C2,  alphav, R0);
+        traits.acc(C3,  alphav, R1);
+        r2.storePacket(0, R0);
+        r3.storePacket(0, R1);
+      }
+
+      // Deal with remaining columns of the rhs
+      for(Index j2=packet_cols4; j2<cols; j2++)
+      {
+        // One column at a time
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0;
+        traits.initAcc(C0);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+        LhsPacket A0;
+
+        for(Index k= 0; k<peeled_kc; k+=pk)
+        {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+          RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K)                                          \
+	      do {                                                      \
+		EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
+		EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+    /* FIXME: why unaligned???? */ \
+		traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
+		traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);		\
+		traits.madd(A0, B_0, C0, B_0, fix<0>);				\
+		EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
+	      } while(false);
+
+          EIGEN_GEBGP_ONESTEP(0);
+          EIGEN_GEBGP_ONESTEP(1);
+          EIGEN_GEBGP_ONESTEP(2);
+          EIGEN_GEBGP_ONESTEP(3);
+          EIGEN_GEBGP_ONESTEP(4);
+          EIGEN_GEBGP_ONESTEP(5);
+          EIGEN_GEBGP_ONESTEP(6);
+          EIGEN_GEBGP_ONESTEP(7);
+
+          blB += pk*RhsProgress;
+          blA += pk*LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
+        }
+
+        // process remaining peeled loop
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          RhsPacket B_0;
+          EIGEN_GEBGP_ONESTEP(0);
+          blB += RhsProgress;
+          blA += LhsProgress;
+        }
+#undef EIGEN_GEBGP_ONESTEP
+        ResPacket R0;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+        R0 = r0.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        r0.storePacket(0, R0);
+      }
+    }
+  }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
+{
+
+EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  {
+        EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+        EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+        traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
+        traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
+        traits.madd(*A0, *B_0, *C0, *B_0);
+        traits.madd(*A0, *B1,  *C1, *B1);
+        traits.madd(*A0, *B2,  *C2, *B2);
+        traits.madd(*A0, *B3,  *C3, *B3);
+        EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+};
+
 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_DONT_INLINE
 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
@@ -904,10 +1415,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
     const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
     const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
-    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
+    const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
+    const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
     enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
     const Index peeled_kc  = depth & ~(pk-1);
-    const Index prefetch_res_offset = 32/sizeof(ResScalar);    
+    const int prefetch_res_offset = 32/sizeof(ResScalar);    
 //     const Index depth2     = depth & ~1;
 
     //---------- Process 3 * LhsProgress rows at once ----------
@@ -965,36 +1478,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
-            RhsPacket B_0, T0;
+            // 15 registers are taken (12 for acc, 2 for lhs).
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
             LhsPacket A2;
-
-#define EIGEN_GEBP_ONESTEP(K) \
-            do { \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+            #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+            // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+            // without this workaround A0, A1, and A2 are loaded in the same register,
+            // which is not good for pipelining
+            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__  ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
+            #else
+            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+            #endif
+#define EIGEN_GEBP_ONESTEP(K)                                                     \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");          \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              internal::prefetch(blA+(3*K+16)*LhsProgress); \
-              if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
-              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
-              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
-              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C0, T0); \
-              traits.madd(A1, B_0, C4, T0); \
-              traits.madd(A2, B_0, C8, B_0); \
-              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C1, T0); \
-              traits.madd(A1, B_0, C5, T0); \
-              traits.madd(A2, B_0, C9, B_0); \
-              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C2,  T0); \
-              traits.madd(A1, B_0, C6,  T0); \
-              traits.madd(A2, B_0, C10, B_0); \
-              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C3 , T0); \
-              traits.madd(A1, B_0, C7,  T0); \
-              traits.madd(A2, B_0, C11, B_0); \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
-            } while(false)
+              internal::prefetch(blA + (3 * K + 16) * LhsProgress);               \
+              if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                            \
+                internal::prefetch(blB + (4 * K + 16) * RhsProgress);             \
+              } /* Bug 953 */                                                     \
+              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+              EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
+              traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel);     \
+              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                         \
+              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                         \
+              traits.madd(A2, rhs_panel, C8, T0, fix<0>);                         \
+              traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                         \
+              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                         \
+              traits.madd(A2, rhs_panel, C9, T0, fix<1>);                         \
+              traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                         \
+              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                         \
+              traits.madd(A2, rhs_panel, C10, T0, fix<2>);                        \
+              traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                         \
+              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                         \
+              traits.madd(A2, rhs_panel, C11, T0, fix<3>);                        \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");            \
+            } while (false)
 
             internal::prefetch(blB);
             EIGEN_GEBP_ONESTEP(0);
@@ -1014,7 +1539,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           // process remaining peeled loop
           for(Index k=peeled_kc; k<depth; k++)
           {
-            RhsPacket B_0, T0;
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
             LhsPacket A2;
             EIGEN_GEBP_ONESTEP(0);
             blB += 4*RhsProgress;
@@ -1026,9 +1552,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C8, alphav, R2);
@@ -1036,9 +1562,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r0.storePacket(1 * Traits::ResPacketSize, R1);
           r0.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C1, alphav, R0);
           traits.acc(C5, alphav, R1);
           traits.acc(C9, alphav, R2);
@@ -1046,9 +1572,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r1.storePacket(1 * Traits::ResPacketSize, R1);
           r1.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C2, alphav, R0);
           traits.acc(C6, alphav, R1);
           traits.acc(C10, alphav, R2);
@@ -1056,9 +1582,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r2.storePacket(1 * Traits::ResPacketSize, R1);
           r2.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C3, alphav, R0);
           traits.acc(C7, alphav, R1);
           traits.acc(C11, alphav, R2);
@@ -1094,20 +1620,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
             RhsPacket B_0;
-#define EIGEN_GEBGP_ONESTEP(K) \
-            do { \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+#define EIGEN_GEBGP_ONESTEP(K)                                                    \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");          \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
-              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
-              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);   \
-              traits.madd(A0, B_0, C0, B_0); \
-              traits.madd(A1, B_0, C4, B_0); \
-              traits.madd(A2, B_0, C8, B_0); \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
-            } while(false)
-        
+              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+              traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
+              traits.madd(A0, B_0, C0, B_0, fix<0>);                              \
+              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
+              traits.madd(A2, B_0, C8, B_0, fix<0>);                              \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");            \
+            } while (false)
+
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
             EIGEN_GEBGP_ONESTEP(2);
@@ -1117,8 +1643,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
             EIGEN_GEBGP_ONESTEP(6);
             EIGEN_GEBGP_ONESTEP(7);
 
-            blB += pk*RhsProgress;
-            blA += pk*3*Traits::LhsProgress;
+            blB += int(pk) * int(RhsProgress);
+            blA += int(pk) * 3 * int(Traits::LhsProgress);
 
             EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
           }
@@ -1135,9 +1661,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C8, alphav, R2);
@@ -1196,7 +1722,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
-            RhsPacket B_0, B1, B2, B3, T0;
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
 
           // NOTE: the begin/end asm comments below work around bug 935!
           // but they are not enough for gcc>=6 without FMA (bug 1637)
@@ -1205,24 +1732,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           #else
             #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
           #endif
-          #define EIGEN_GEBGP_ONESTEP(K) \
-            do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");        \
-              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                    \
-              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                    \
-              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
-              traits.madd(A0, B_0, C0, T0);                                     \
-              traits.madd(A1, B_0, C4, B_0);                                    \
-              traits.madd(A0, B1,  C1, T0);                                     \
-              traits.madd(A1, B1,  C5, B1);                                     \
-              traits.madd(A0, B2,  C2, T0);                                     \
-              traits.madd(A1, B2,  C6, B2);                                     \
-              traits.madd(A0, B3,  C3, T0);                                     \
-              traits.madd(A1, B3,  C7, B3);                                     \
-              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                               \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");          \
-            } while(false)
-            
+#define EIGEN_GEBGP_ONESTEP(K)                                            \
+            do {                                                          \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");  \
+              traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);        \
+              traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);        \
+              traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                 \
+              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                 \
+              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                 \
+              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                 \
+              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                 \
+              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                 \
+              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                 \
+              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                 \
+              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                         \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");    \
+            } while (false)
+
             internal::prefetch(blB+(48+0));
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
@@ -1242,7 +1769,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           // process remaining peeled loop
           for(Index k=peeled_kc; k<depth; k++)
           {
-            RhsPacket B_0, B1, B2, B3, T0;
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
             EIGEN_GEBGP_ONESTEP(0);
             blB += 4*RhsProgress;
             blA += 2*Traits::LhsProgress;
@@ -1252,10 +1780,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           ResPacket R0, R1, R2, R3;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
-          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C1, alphav, R2);
@@ -1265,10 +1793,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r1.storePacket(0 * Traits::ResPacketSize, R2);
           r1.storePacket(1 * Traits::ResPacketSize, R3);
 
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
-          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C2,  alphav, R0);
           traits.acc(C6,  alphav, R1);
           traits.acc(C3,  alphav, R2);
@@ -1313,8 +1841,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
               traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
               traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
-              traits.madd(A0, B_0, C0, B1);                                       \
-              traits.madd(A1, B_0, C4, B_0);                                      \
+              traits.madd(A0, B_0, C0, B1, fix<0>);                               \
+              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
               EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
             } while(false)
         
@@ -1327,8 +1855,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
             EIGEN_GEBGP_ONESTEP(6);
             EIGEN_GEBGP_ONESTEP(7);
 
-            blB += pk*RhsProgress;
-            blA += pk*2*Traits::LhsProgress;
+            blB += int(pk) * int(RhsProgress);
+            blA += int(pk) * 2 * int(Traits::LhsProgress);
 
             EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
           }
@@ -1345,8 +1873,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           ResPacket R0, R1;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
@@ -1358,186 +1886,43 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     //---------- Process 1 * LhsProgress rows at once ----------
     if(mr>=1*Traits::LhsProgress)
     {
-      // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
-      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
-      {
-        // loops on each largest micro vertical panel of rhs (depth * nr)
-        for(Index j2=0; j2<packet_cols4; j2+=nr)
-        {
-          // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
-          // stored into 1 x nr registers.
-          
-          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
-          prefetch(&blA[0]);
-
-          // gets res block as register
-          AccPacket C0, C1, C2, C3;
-          traits.initAcc(C0);
-          traits.initAcc(C1);
-          traits.initAcc(C2);
-          traits.initAcc(C3);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-
-          r0.prefetch(prefetch_res_offset);
-          r1.prefetch(prefetch_res_offset);
-          r2.prefetch(prefetch_res_offset);
-          r3.prefetch(prefetch_res_offset);
-
-          // performs "inner" products
-          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-          prefetch(&blB[0]);
-          LhsPacket A0;
-
-          for(Index k=0; k<peeled_kc; k+=pk)
-          {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
-            RhsPacket B_0, B1, B2, B3;
-               
-#define EIGEN_GEBGP_ONESTEP(K) \
-            do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4");        \
-              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
-              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
-              traits.madd(A0, B_0, C0, B_0);                                    \
-              traits.madd(A0, B1,  C1, B1);                                     \
-              traits.madd(A0, B2,  C2, B2);                                     \
-              traits.madd(A0, B3,  C3, B3);                                     \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4");          \
-            } while(false)
-            
-            internal::prefetch(blB+(48+0));
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            internal::prefetch(blB+(48+16));
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
-
-            blB += pk*4*RhsProgress;
-            blA += pk*1*LhsProgress;
-
-            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
-          }
-          // process remaining peeled loop
-          for(Index k=peeled_kc; k<depth; k++)
-          {
-            RhsPacket B_0, B1, B2, B3;
-            EIGEN_GEBGP_ONESTEP(0);
-            blB += 4*RhsProgress;
-            blA += 1*LhsProgress;
-          }
-#undef EIGEN_GEBGP_ONESTEP
-
-          ResPacket R0, R1;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r1.loadPacket(0 * Traits::ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          traits.acc(C1,  alphav, R1);
-          r0.storePacket(0 * Traits::ResPacketSize, R0);
-          r1.storePacket(0 * Traits::ResPacketSize, R1);
-
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r3.loadPacket(0 * Traits::ResPacketSize);
-          traits.acc(C2,  alphav, R0);
-          traits.acc(C3,  alphav, R1);
-          r2.storePacket(0 * Traits::ResPacketSize, R0);
-          r3.storePacket(0 * Traits::ResPacketSize, R1);
-        }
-
-        // Deal with remaining columns of the rhs
-        for(Index j2=packet_cols4; j2<cols; j2++)
-        {
-          // One column at a time
-          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
-          prefetch(&blA[0]);
-
-          // gets res block as register
-          AccPacket C0;
-          traits.initAcc(C0);
-
-          LinearMapper r0 = res.getLinearMapper(i, j2);
-
-          // performs "inner" products
-          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-          LhsPacket A0;
-
-          for(Index k=0; k<peeled_kc; k+=pk)
-          {
-            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
-            RhsPacket B_0;
-        
-#define EIGEN_GEBGP_ONESTEP(K) \
-            do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");        \
-              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
-              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
-              traits.madd(A0, B_0, C0, B_0);                                    \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");          \
-            } while(false);
-
-            EIGEN_GEBGP_ONESTEP(0);
-            EIGEN_GEBGP_ONESTEP(1);
-            EIGEN_GEBGP_ONESTEP(2);
-            EIGEN_GEBGP_ONESTEP(3);
-            EIGEN_GEBGP_ONESTEP(4);
-            EIGEN_GEBGP_ONESTEP(5);
-            EIGEN_GEBGP_ONESTEP(6);
-            EIGEN_GEBGP_ONESTEP(7);
-
-            blB += pk*RhsProgress;
-            blA += pk*1*Traits::LhsProgress;
-
-            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
-          }
-
-          // process remaining peeled loop
-          for(Index k=peeled_kc; k<depth; k++)
-          {
-            RhsPacket B_0;
-            EIGEN_GEBGP_ONESTEP(0);
-            blB += RhsProgress;
-            blA += 1*Traits::LhsProgress;
-          }
-#undef EIGEN_GEBGP_ONESTEP
-          ResPacket R0;
-          ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          r0.storePacket(0 * Traits::ResPacketSize, R0);
-        }
-      }
+      lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+    }
+    //---------- Process LhsProgressHalf rows at once ----------
+    if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
+    {
+      lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+    }
+    //---------- Process LhsProgressQuarter rows at once ----------
+    if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
+    {
+      lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
+      p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
     }
     //---------- Process remaining rows, 1 at once ----------
-    if(peeled_mc1<rows)
+    if(peeled_mc_quarter<rows)
     {
       // loop on each panel of the rhs
       for(Index j2=0; j2<packet_cols4; j2+=nr)
       {
         // loop on each row of the lhs (1*LhsProgress x depth)
-        for(Index i=peeled_mc1; i<rows; i+=1)
+        for(Index i=peeled_mc_quarter; i<rows; i+=1)
         {
           const LhsScalar* blA = &blockA[i*strideA+offsetA];
           prefetch(&blA[0]);
           const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
 
-          // The following piece of code wont work for 512 bit registers
-          // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
-          // as nr (which is currently 4) for the return type.
+          // If LhsProgress is 8 or 16, it assumes that there is a
+          // half or quarter packet, respectively, of the same size as
+          // nr (which is currently 4) for the return type.
           const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
+          const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
           if ((SwappedTraits::LhsProgress % 4) == 0 &&
-              (SwappedTraits::LhsProgress <= 8) &&
-              (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
+              (SwappedTraits::LhsProgress<=16) &&
+              (SwappedTraits::LhsProgress!=8  || SResPacketHalfSize==nr) &&
+              (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
           {
             SAccPacket C0, C1, C2, C3;
             straits.initAcc(C0);
@@ -1560,15 +1945,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
               straits.loadRhsQuad(blA+0*spk, B_0);
               straits.loadRhsQuad(blA+1*spk, B_1);
-              straits.madd(A0,B_0,C0,B_0);
-              straits.madd(A1,B_1,C1,B_1);
+              straits.madd(A0,B_0,C0,B_0, fix<0>);
+              straits.madd(A1,B_1,C1,B_1, fix<0>);
 
               straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
               straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
               straits.loadRhsQuad(blA+2*spk, B_0);
               straits.loadRhsQuad(blA+3*spk, B_1);
-              straits.madd(A0,B_0,C2,B_0);
-              straits.madd(A1,B_1,C3,B_1);
+              straits.madd(A0,B_0,C2,B_0, fix<0>);
+              straits.madd(A1,B_1,C3,B_1, fix<0>);
 
               blB += 4*SwappedTraits::LhsProgress;
               blA += 4*spk;
@@ -1581,7 +1966,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
               straits.loadLhsUnaligned(blB, A0);
               straits.loadRhsQuad(blA, B_0);
-              straits.madd(A0,B_0,C0,B_0);
+              straits.madd(A0,B_0,C0,B_0, fix<0>);
 
               blB += SwappedTraits::LhsProgress;
               blA += spk;
@@ -1591,7 +1976,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
               // Special case where we have to first reduce the accumulation register C0
               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
-              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
 
               SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
@@ -1604,16 +1989,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
                 SRhsPacketHalf b0;
                 straits.loadLhsUnaligned(blB, a0);
                 straits.loadRhs(blA, b0);
-                SAccPacketHalf c0 = predux_downto4(C0);
-                straits.madd(a0,b0,c0,b0);
+                SAccPacketHalf c0 = predux_half_dowto4(C0);
+                straits.madd(a0,b0,c0,b0, fix<0>);
                 straits.acc(c0, alphav, R);
               }
               else
               {
-                straits.acc(predux_downto4(C0), alphav, R);
+                straits.acc(predux_half_dowto4(C0), alphav, R);
               }
               res.scatterPacket(i, j2, R);
             }
+            else if (SwappedTraits::LhsProgress==16)
+            {
+              // Special case where we have to first reduce the
+              // accumulation register C0. We specialize the block in
+              // template form, so that LhsProgress < 16 paths don't
+              // fail to compile
+              last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
+	            p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
+            }
             else
             {
               SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
@@ -1636,14 +2030,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
               B_0 = blB[0];
               B_1 = blB[1];
-              CJMADD(cj,A0,B_0,C0,  B_0);
-              CJMADD(cj,A0,B_1,C1,  B_1);
-              
+              C0 = cj.pmadd(A0,B_0,C0);
+              C1 = cj.pmadd(A0,B_1,C1);
+
               B_0 = blB[2];
               B_1 = blB[3];
-              CJMADD(cj,A0,B_0,C2,  B_0);
-              CJMADD(cj,A0,B_1,C3,  B_1);
-              
+              C2 = cj.pmadd(A0,B_0,C2);
+              C3 = cj.pmadd(A0,B_1,C3);
+
               blB += 4;
             }
             res(i, j2 + 0) += alpha * C0;
@@ -1657,7 +2051,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
       for(Index j2=packet_cols4; j2<cols; j2++)
       {
         // loop on each row of the lhs (1*LhsProgress x depth)
-        for(Index i=peeled_mc1; i<rows; i+=1)
+        for(Index i=peeled_mc_quarter; i<rows; i+=1)
         {
           const LhsScalar* blA = &blockA[i*strideA+offsetA];
           prefetch(&blA[0]);
@@ -1668,7 +2062,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           {
             LhsScalar A0 = blA[k];
             RhsScalar B_0 = blB[k];
-            CJMADD(cj, A0, B_0, C0, B_0);
+            C0 = cj.pmadd(A0, B_0, C0);
           }
           res(i, j2) += alpha * C0;
         }
@@ -1677,8 +2071,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
   }
 
 
-#undef CJMADD
-
 // pack a block of the lhs
 // The traversal is as follow (mr==4):
 //   0  4  8 12 ...
@@ -1693,19 +2085,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 //
 //  32 33 34 35 ...
 //  36 36 38 39 ...
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum { PacketSize = packet_traits<Scalar>::size };
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum { PacketSize = unpacket_traits<Packet>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+         HasHalf = (int)HalfPacketSize < (int)PacketSize,
+         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -1717,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
 
   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
-                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0;
+  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
+  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
+                         : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
 
   Index i=0;
 
@@ -1733,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
       for(Index k=0; k<depth; k++)
       {
         Packet A, B, C;
-        A = lhs.loadPacket(i+0*PacketSize, k);
-        B = lhs.loadPacket(i+1*PacketSize, k);
-        C = lhs.loadPacket(i+2*PacketSize, k);
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+        C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
         pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1753,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
       for(Index k=0; k<depth; k++)
       {
         Packet A, B;
-        A = lhs.loadPacket(i+0*PacketSize, k);
-        B = lhs.loadPacket(i+1*PacketSize, k);
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
         pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
       }
@@ -1771,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
       for(Index k=0; k<depth; k++)
       {
         Packet A;
-        A = lhs.loadPacket(i+0*PacketSize, k);
+        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
         pstore(blockA+count, cj.pconj(A));
         count+=PacketSize;
       }
       if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
     }
   }
-  // Pack scalars
+  // Pack half packets
+  if(HasHalf && Pack1>=HalfPacketSize)
+  {
+    for(; i<peeled_mc_half; i+=HalfPacketSize)
+    {
+      if(PanelMode) count += (HalfPacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        HalfPacket A;
+        A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
+        pstoreu(blockA+count, cj.pconj(A));
+        count+=HalfPacketSize;
+      }
+      if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack quarter packets
+  if(HasQuarter && Pack1>=QuarterPacketSize)
+  {
+    for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
+    {
+      if(PanelMode) count += (QuarterPacketSize) * offset;
+
+      for(Index k=0; k<depth; k++)
+      {
+        QuarterPacket A;
+        A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
+        pstoreu(blockA+count, cj.pconj(A));
+        count+=QuarterPacketSize;
+      }
+      if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack2 may be *smaller* than PacketSize—that happens for
+  // products like real * complex, where we have to go half the
+  // progress on the lhs in order to duplicate those operands to
+  // address both real & imaginary parts on the rhs. This portion will
+  // pack those half ones until they match the number expected on the
+  // last peeling loop at this point (for the rhs).
   if(Pack2<PacketSize && Pack2>1)
   {
-    for(; i<peeled_mc0; i+=Pack2)
+    for(; i<peeled_mc0; i+=last_lhs_progress)
     {
-      if(PanelMode) count += Pack2 * offset;
+      if(PanelMode) count += last_lhs_progress * offset;
 
       for(Index k=0; k<depth; k++)
-        for(Index w=0; w<Pack2; w++)
+        for(Index w=0; w<last_lhs_progress; w++)
           blockA[count++] = cj(lhs(i+w, k));
 
-      if(PanelMode) count += Pack2 * (stride-offset-depth);
+      if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
     }
   }
+  // Pack scalars
   for(; i<rows; i++)
   {
     if(PanelMode) count += offset;
@@ -1801,19 +2241,24 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
   }
 }
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum { PacketSize = packet_traits<Scalar>::size };
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum { PacketSize = unpacket_traits<Packet>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+         HasHalf = (int)HalfPacketSize < (int)PacketSize,
+         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -1821,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   Index count = 0;
+  bool gone_half = false, gone_quarter = false, gone_last = false;
 
-//   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
-//   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-//   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-
-  int pack = Pack1;
   Index i = 0;
+  int pack = Pack1;
+  int psize = PacketSize;
   while(pack>0)
   {
     Index remaining_rows = rows-i;
-    Index peeled_mc = i+(remaining_rows/pack)*pack;
+    Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
+    Index starting_pos = i;
     for(; i<peeled_mc; i+=pack)
     {
       if(PanelMode) count += pack * offset;
 
-      const Index peeled_k = (depth/PacketSize)*PacketSize;
       Index k=0;
-      if(pack>=PacketSize)
+      if(pack>=psize && psize >= QuarterPacketSize)
       {
-        for(; k<peeled_k; k+=PacketSize)
+        const Index peeled_k = (depth/psize)*psize;
+        for(; k<peeled_k; k+=psize)
         {
-          for (Index m = 0; m < pack; m += PacketSize)
+          for (Index m = 0; m < pack; m += psize)
           {
-            PacketBlock<Packet> kernel;
-            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
-            ptranspose(kernel);
-            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+            if (psize == PacketSize) {
+              PacketBlock<Packet> kernel;
+              for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
+              ptranspose(kernel);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+            } else if (HasHalf && psize == HalfPacketSize) {
+              gone_half = true;
+              PacketBlock<HalfPacket> kernel_half;
+              for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
+              ptranspose(kernel_half);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
+            } else if (HasQuarter && psize == QuarterPacketSize) {
+              gone_quarter = true;
+              PacketBlock<QuarterPacket> kernel_quarter;
+              for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
+              ptranspose(kernel_quarter);
+              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
+	    }
           }
-          count += PacketSize*pack;
+          count += psize*pack;
         }
       }
+
       for(; k<depth; k++)
       {
         Index w=0;
@@ -1874,9 +2333,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
       if(PanelMode) count += pack * (stride-offset-depth);
     }
 
-    pack -= PacketSize;
-    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
-      pack = Pack2;
+    pack -= psize;
+    Index left = rows - i;
+    if (pack <= 0) {
+      if (!gone_last &&
+          (starting_pos == i || left >= psize/2 || left >= psize/4) &&
+          ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
+           (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
+        psize /= 2;
+        pack = psize;
+        continue;
+      }
+      // Pack2 may be *smaller* than PacketSize—that happens for
+      // products like real * complex, where we have to go half the
+      // progress on the lhs in order to duplicate those operands to
+      // address both real & imaginary parts on the rhs. This portion will
+      // pack those half ones until they match the number expected on the
+      // last peeling loop at this point (for the rhs).
+      if (Pack2 < PacketSize && !gone_last) {
+        gone_last = true;
+        psize = pack = left & ~1;
+      }
+    }
   }
 
   for(; i<rows; i++)
@@ -1932,7 +2410,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
 //       const Scalar* b6 = &rhs[(j2+6)*rhsStride];
 //       const Scalar* b7 = &rhs[(j2+7)*rhsStride];
 //       Index k=0;
-//       if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
+//       if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
 //       {
 //         for(; k<peeled_k; k+=PacketSize) {
 //           PacketBlock<Packet> kernel;
@@ -1979,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
       {
         for(; k<peeled_k; k+=PacketSize) {
           PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
-          kernel.packet[0] = dm0.loadPacket(k);
-          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
-          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
-          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
+          kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
+          kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
           ptranspose(kernel);
           pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
           pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
@@ -2023,94 +2501,104 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj
 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
   typedef typename DataMapper::LinearMapper LinearMapper;
-  enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
-};
-
-template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
-  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
-  EIGEN_UNUSED_VARIABLE(stride);
-  EIGEN_UNUSED_VARIABLE(offset);
-  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
-  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
-  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
-  Index count = 0;
-
-//   if(nr>=8)
-//   {
-//     for(Index j2=0; j2<packet_cols8; j2+=8)
-//     {
-//       // skip what we have before
-//       if(PanelMode) count += 8 * offset;
-//       for(Index k=0; k<depth; k++)
-//       {
-//         if (PacketSize==8) {
-//           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
-//           pstoreu(blockB+count, cj.pconj(A));
-//         } else if (PacketSize==4) {
-//           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
-//           Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
-//           pstoreu(blockB+count, cj.pconj(A));
-//           pstoreu(blockB+count+PacketSize, cj.pconj(B));
-//         } else {
-//           const Scalar* b0 = &rhs[k*rhsStride + j2];
-//           blockB[count+0] = cj(b0[0]);
-//           blockB[count+1] = cj(b0[1]);
-//           blockB[count+2] = cj(b0[2]);
-//           blockB[count+3] = cj(b0[3]);
-//           blockB[count+4] = cj(b0[4]);
-//           blockB[count+5] = cj(b0[5]);
-//           blockB[count+6] = cj(b0[6]);
-//           blockB[count+7] = cj(b0[7]);
-//         }
-//         count += 8;
-//       }
-//       // skip what we have after
-//       if(PanelMode) count += 8 * (stride-offset-depth);
-//     }
-//   }
-  if(nr>=4)
+  enum { PacketSize = packet_traits<Scalar>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+		 QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
   {
-    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(offset);
+    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
+    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+    Index count = 0;
+
+  //   if(nr>=8)
+  //   {
+  //     for(Index j2=0; j2<packet_cols8; j2+=8)
+  //     {
+  //       // skip what we have before
+  //       if(PanelMode) count += 8 * offset;
+  //       for(Index k=0; k<depth; k++)
+  //       {
+  //         if (PacketSize==8) {
+  //           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+  //           pstoreu(blockB+count, cj.pconj(A));
+  //         } else if (PacketSize==4) {
+  //           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+  //           Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+  //           pstoreu(blockB+count, cj.pconj(A));
+  //           pstoreu(blockB+count+PacketSize, cj.pconj(B));
+  //         } else {
+  //           const Scalar* b0 = &rhs[k*rhsStride + j2];
+  //           blockB[count+0] = cj(b0[0]);
+  //           blockB[count+1] = cj(b0[1]);
+  //           blockB[count+2] = cj(b0[2]);
+  //           blockB[count+3] = cj(b0[3]);
+  //           blockB[count+4] = cj(b0[4]);
+  //           blockB[count+5] = cj(b0[5]);
+  //           blockB[count+6] = cj(b0[6]);
+  //           blockB[count+7] = cj(b0[7]);
+  //         }
+  //         count += 8;
+  //       }
+  //       // skip what we have after
+  //       if(PanelMode) count += 8 * (stride-offset-depth);
+  //     }
+  //   }
+    if(nr>=4)
     {
-      // skip what we have before
-      if(PanelMode) count += 4 * offset;
-      for(Index k=0; k<depth; k++)
+      for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
       {
-        if (PacketSize==4) {
-          Packet A = rhs.loadPacket(k, j2);
-          pstoreu(blockB+count, cj.pconj(A));
-          count += PacketSize;
-        } else {
-          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
-          blockB[count+0] = cj(dm0(0));
-          blockB[count+1] = cj(dm0(1));
-          blockB[count+2] = cj(dm0(2));
-          blockB[count+3] = cj(dm0(3));
-          count += 4;
+        // skip what we have before
+        if(PanelMode) count += 4 * offset;
+        for(Index k=0; k<depth; k++)
+        {
+          if (PacketSize==4) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += PacketSize;
+          } else if (HasHalf && HalfPacketSize==4) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += HalfPacketSize;
+          } else if (HasQuarter && QuarterPacketSize==4) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB+count, cj.pconj(A));
+            count += QuarterPacketSize;
+          } else {
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count+0] = cj(dm0(0));
+            blockB[count+1] = cj(dm0(1));
+            blockB[count+2] = cj(dm0(2));
+            blockB[count+3] = cj(dm0(3));
+            count += 4;
+          }
         }
+        // skip what we have after
+        if(PanelMode) count += 4 * (stride-offset-depth);
       }
-      // skip what we have after
-      if(PanelMode) count += 4 * (stride-offset-depth);
     }
-  }
-  // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols4; j2<cols; ++j2)
-  {
-    if(PanelMode) count += offset;
-    for(Index k=0; k<depth; k++)
+    // copy the remaining columns one at a time (nr==1)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
     {
-      blockB[count] = cj(rhs(k, j2));
-      count += 1;
+      if(PanelMode) count += offset;
+      for(Index k=0; k<depth; k++)
+      {
+        blockB[count] = cj(rhs(k, j2));
+        count += 1;
+      }
+      if(PanelMode) count += stride-offset-depth;
     }
-    if(PanelMode) count += stride-offset-depth;
   }
-}
+};
 
 } // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h b/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
index ed6234c378e8260088c88027d0cee53a33840574..caa65fcccf56cfbc0de60ae65c98e6975fbb89ea 100644
--- a/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -77,7 +77,7 @@ static void run(Index rows, Index cols, Index depth,
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
   Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
 
-  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
   gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
   gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
 
@@ -110,7 +110,7 @@ static void run(Index rows, Index cols, Index depth,
       // i.e., we test that info[tid].users equals 0.
       // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
       while(info[tid].users!=0) {}
-      info[tid].users += threads;
+      info[tid].users = threads;
 
       pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
 
@@ -148,7 +148,9 @@ static void run(Index rows, Index cols, Index depth,
       // Release all the sub blocks A'_i of A' for the current thread,
       // i.e., we simply decrement the number of users by 1
       for(Index i=0; i<threads; ++i)
+#if !EIGEN_HAS_CXX11_ATOMIC
         #pragma omp atomic
+#endif
         info[i].users -= 1;
     }
   }
@@ -429,7 +431,13 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   template<typename Dst>
   static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
+    // to determine the following heuristic.
+    // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
+    // unless it has been specialized by the user or for a given architecture.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
+    // I'm not sure it is still required.
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
       lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
     else
     {
@@ -441,7 +449,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   template<typename Dst>
   static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
       lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst,lhs, rhs, Scalar(1));
@@ -450,7 +458,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   template<typename Dst>
   static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
       lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
@@ -463,11 +471,25 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
     if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
       return;
 
+    if (dst.cols() == 1)
+    {
+      // Fallback to GEMV if either the lhs or rhs is a runtime vector
+      typename Dest::ColXpr dst_vec(dst.col(0));
+      return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct>
+        ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
+    }
+    else if (dst.rows() == 1)
+    {
+      // Fallback to GEMV if either the lhs or rhs is a runtime vector
+      typename Dest::RowXpr dst_vec(dst.row(0));
+      return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct>
+        ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
+    }
+
     typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
-                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+    Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs);
 
     typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
             Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
diff --git a/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index d68d2f965721237dcdef81b8742e9c3549f644c2..6ba0d9bdb8d1752af714b5c7e326af6bb5e9a8d8 100644
--- a/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -87,7 +87,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
 
-    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
     tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
@@ -302,13 +302,13 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename ProductType>
-TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
+EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
 {
   EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
   eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
-  
+
   general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
-  
+
   return derived();
 }
 
diff --git a/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 691f95d6979fbd4dd030d8d2c5e34a3e24373929..9a650ec23d5404aff0aefcfbff176c86de9c82d7 100644
--- a/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/contrib/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -37,7 +37,7 @@ namespace Eigen {
 
 namespace internal {
 
-template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int  UpLo>
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
 struct general_matrix_matrix_rankupdate :
        general_matrix_matrix_triangular_product<
          Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
diff --git a/contrib/eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/contrib/eigen/Eigen/src/Core/products/GeneralMatrixVector.h
index a597c1f4ee68db351c69f1b8c110495383ab574c..dfb6aebced46a246820c6f68efd15a400bfd30e2 100644
--- a/contrib/eigen/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/contrib/eigen/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,11 +14,57 @@ namespace Eigen {
 
 namespace internal {
 
+enum GEMVPacketSizeType {
+  GEMVPacketFull = 0,
+  GEMVPacketHalf,
+  GEMVPacketQuarter
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct gemv_packet_cond { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
+class gemv_traits
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                        \
+  typedef typename gemv_packet_cond<packet_size,                                  \
+                                    typename packet_traits<name ## Scalar>::type, \
+                                    typename packet_traits<name ## Scalar>::half, \
+                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  prefix ## name ## Packet
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+#undef PACKET_DECL_COND_PREFIX
+
+public:
+  enum {
+        Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
+        unpacket_traits<_RhsPacket>::vectorizable &&
+        int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
+        LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+        RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+        ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
+  };
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+};
+
+
 /* Optimized col-major matrix * vector product:
- * This algorithm processes 4 columns at onces that allows to both reduce
- * the number of load/stores of the result by a factor 4 and to reduce
- * the instruction dependency. Moreover, we know that all bands have the
- * same alignment pattern.
+ * This algorithm processes the matrix per vertical panels,
+ * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments.
  *
  * Mixing type logic: C += alpha * A * B
  *  |  A  |  B  |alpha| comments
@@ -27,56 +73,30 @@ namespace internal {
  *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
  *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
  *
- * Accesses to the matrix coefficients follow the following logic:
- *
- * - if all columns have the same alignment then
- *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
- *   - otherwise perform unaligned loads only (-> NoneAligned case)
- * - otherwise
- *   - if even columns have the same alignment then
- *     // odd columns are guaranteed to have the same alignment too
- *     - if even or odd columns have the same alignment as the result, then
- *       // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
- *       - perform half aligned and half unaligned loads (-> EvenAligned case)
- *     - otherwise perform unaligned loads only (-> NoneAligned case)
- *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
- *     - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
- *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
- *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
- *   - otherwise,
- *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
- *     // we currently fall back to the NoneAligned case
- *
  * The same reasoning apply for the transposed case.
- *
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
- * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
- * compared to unaligned loads on a 4 byte boundary.
- *
  */
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
+  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
-};
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
 
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
 
-EIGEN_DONT_INLINE static void run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
   const LhsMapper& lhs,
   const RhsMapper& rhs,
@@ -85,244 +105,187 @@ EIGEN_DONT_INLINE static void run(
 };
 
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& lhs,
+  const LhsMapper& alhs,
   const RhsMapper& rhs,
         ResScalar* res, Index resIncr,
   RhsScalar alpha)
 {
   EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr==1);
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
-    pstore(&res[j], \
-      padd(pload<ResPacket>(&res[j]), \
-        padd( \
-      padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
-      pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)),   \
-      padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
-      pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
-
-  typedef typename LhsMapper::VectorMapper LhsScalars;
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate propoer code.
+  LhsMapper lhs(alhs);
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  if(ConjugateRhs)
-    alpha = numext::conj(alpha);
-
-  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
-  const Index columnsAtOnce = 4;
-  const Index peels = 2;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index ResPacketAlignedMask = ResPacketSize-1;
-//  const Index PeelAlignedMask = ResPacketSize*peels-1;
-  const Index size = rows;
+  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
 
   const Index lhsStride = lhs.stride();
-
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_default_aligned(res,size);
-  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                       : FirstAligned;
-
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(size);
-
-  // find how many columns do we have to skip to be aligned with the result (if possible)
-  Index skipColumns = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
-  {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    // Currently, it seems to be better to perform unaligned loads anyway
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned,
+         ResPacketSize = Traits::ResPacketSize,
+         ResPacketSizeHalf = HalfTraits::ResPacketSize,
+         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+         LhsPacketSize = Traits::LhsPacketSize,
+         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+
+  const Index n8 = rows-8*ResPacketSize+1;
+  const Index n4 = rows-4*ResPacketSize+1;
+  const Index n3 = rows-3*ResPacketSize+1;
+  const Index n2 = rows-2*ResPacketSize+1;
+  const Index n1 = rows-1*ResPacketSize+1;
+  const Index n_half = rows-1*ResPacketSizeHalf+1;
+  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
+
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
+  ResPacket palpha = pset1<ResPacket>(alpha);
+  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
+  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
+
+  for(Index j2=0; j2<cols; j2+=block_cols)
   {
-  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
-
-    while (skipColumns<LhsPacketSize &&
-          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
-      ++skipColumns;
-    if (skipColumns==LhsPacketSize)
+    Index jend = numext::mini(j2+block_cols,cols);
+    Index i=0;
+    for(; i<n8; i+=ResPacketSize*8)
     {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipColumns = 0;
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0)),
+                c4 = pset1<ResPacket>(ResScalar(0)),
+                c5 = pset1<ResPacket>(ResScalar(0)),
+                c6 = pset1<ResPacket>(ResScalar(0)),
+                c7 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
+        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
+        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
+        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
+      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
+      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
+      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
     }
-    else
+    if(i<n4)
     {
-      skipColumns = (std::min)(skipColumns,cols);
-      // note that the skiped columns are processed later.
-    }
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0));
 
-    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
-                      || (skipColumns + columnsAtOnce >= cols)
-                      || LhsPacketSize > size
-                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
-  }
-  else if(Vectorizable)
-  {
-    alignedStart = 0;
-    alignedSize = size;
-    alignmentPattern = AllAligned;
-  }
-
-  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
-  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
 
-  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
-  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
-  {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
+      i+=ResPacketSize*4;
+    }
+    if(i<n3)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0));
 
-    // this helps a lot generating better binary code
-    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
-                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
 
-    if (Vectorizable)
+      i+=ResPacketSize*3;
+    }
+    if(i<n2)
     {
-      /* explicit vectorization */
-      // process initial unaligned coeffs
-      for (Index j=0; j<alignedStart; ++j)
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
       {
-        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
-        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
-        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
-        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
       }
-
-      if (alignedSize>alignedStart)
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      i+=ResPacketSize*2;
+    }
+    if(i<n1)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
       {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if(peels>1)
-            {
-              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
-              ResPacket T0, T1;
-
-              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
-              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
-              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-
-              for (; j<peeledSize; j+=peels*ResPacketSize)
-              {
-                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
-                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
-                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
-
-                A00 = lhs0.template load<LhsPacket, Aligned>(j);
-                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
-                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
-                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
-
-                T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
-                T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
-                T0  = pcj.pmadd(A03, ptmp3, T0);
-                pstore(&res[j],T0);
-                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
-                T1  = pcj.pmadd(A11, ptmp1, T1);
-                T1  = pcj.pmadd(A12, ptmp2, T1);
-                T1  = pcj.pmadd(A13, ptmp3, T1);
-                pstore(&res[j+ResPacketSize],T1);
-              }
-            }
-            for (; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
-            break;
-        }
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
       }
-    } // end explicit vectorization
-
-    /* process remaining coeffs (or all if there is no explicit vectorization) */
-    for (Index j=alignedSize; j<size; ++j)
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      i+=ResPacketSize;
+    }
+    if(HasHalf && i<n_half)
     {
-      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
-      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
-      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
-      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
+        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
+      i+=ResPacketSizeHalf;
     }
-  }
-
-  // process remaining first and last columns (at most columnsAtOnce-1)
-  Index end = cols;
-  Index start = columnBound;
-  do
-  {
-    for (Index k=start; k<end; ++k)
+    if(HasQuarter && i<n_quarter)
     {
-      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
-      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
-
-      if (Vectorizable)
+      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
       {
-        /* explicit vectorization */
-        // process first unaligned result's coeffs
-        for (Index j=0; j<alignedStart; ++j)
-          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
-        // process aligned result's coeffs
-        if (lhs0.template aligned<LhsPacket>(alignedStart))
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
-        else
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
+        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
       }
-
-      // process remaining scalars (or all if no explicit vectorization)
-      for (Index i=alignedSize; i<size; ++i)
-        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
+      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
+      i+=ResPacketSizeQuarter;
     }
-    if (skipColumns)
+    for(;i<rows;++i)
     {
-      start = 0;
-      end = skipColumns;
-      skipColumns = 0;
+      ResScalar c0(0);
+      for(Index j=j2; j<jend; j+=1)
+        c0 += cj.pmul(lhs(i,j), rhs(j,0));
+      res[i] += alpha*c0;
     }
-    else
-      break;
-  } while(Vectorizable);
-  #undef _EIGEN_ACCUMULATE_PACKETS
+  }
 }
 
 /* Optimized row-major matrix * vector product:
- * This algorithm processes 4 rows at onces that allows to both reduce
+ * This algorithm processes 4 rows at once that allows to both reduce
  * the number of load/stores of the result by a factor 4 and to reduce
  * the instruction dependency. Moreover, we know that all bands have the
  * same alignment pattern.
@@ -334,25 +297,25 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
-};
+  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
 
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
 
-EIGEN_DONT_INLINE static void run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
   const LhsMapper& lhs,
   const RhsMapper& rhs,
@@ -361,255 +324,191 @@ EIGEN_DONT_INLINE static void run(
 };
 
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& lhs,
+  const LhsMapper& alhs,
   const RhsMapper& rhs,
   ResScalar* res, Index resIncr,
   ResScalar alpha)
 {
-  eigen_internal_assert(rhs.stride()==1);
-
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
-    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);  \
-    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
-    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
-    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
-    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate propoer code.
+  LhsMapper lhs(alhs);
 
+  eigen_internal_assert(rhs.stride()==1);
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-
-  typedef typename LhsMapper::VectorMapper LhsScalars;
-
-  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
-  const Index rowsAtOnce = 4;
-  const Index peels = 2;
-  const Index RhsPacketAlignedMask = RhsPacketSize-1;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index depth = cols;
-  const Index lhsStride = lhs.stride();
-
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type
-  // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = rhs.firstAligned(depth);
-  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                           : FirstAligned;
-
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
-  const Index rhsAlignmentOffset = rhs.firstAligned(rows);
-
-  // find how many rows do we have to skip to be aligned with rhs (if possible)
-  Index skipRows = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
-      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
-      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
-  {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
+  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
+
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
+  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
+  const Index n4 = rows-3;
+  const Index n2 = rows-1;
+
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned,
+         ResPacketSize = Traits::ResPacketSize,
+         ResPacketSizeHalf = HalfTraits::ResPacketSize,
+         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+         LhsPacketSize = Traits::LhsPacketSize,
+         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+
+  Index i=0;
+  for(; i<n8; i+=8)
   {
-  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
-
-    while (skipRows<LhsPacketSize &&
-           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
-      ++skipRows;
-    if (skipRows==LhsPacketSize)
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)),
+              c3 = pset1<ResPacket>(ResScalar(0)),
+              c4 = pset1<ResPacket>(ResScalar(0)),
+              c5 = pset1<ResPacket>(ResScalar(0)),
+              c6 = pset1<ResPacket>(ResScalar(0)),
+              c7 = pset1<ResPacket>(ResScalar(0));
+
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
     {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipRows = 0;
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
+      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
+      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
+      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
     }
-    else
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    ResScalar cc4 = predux(c4);
+    ResScalar cc5 = predux(c5);
+    ResScalar cc6 = predux(c6);
+    ResScalar cc7 = predux(c7);
+    for(; j<cols; ++j)
     {
-      skipRows = (std::min)(skipRows,Index(rows));
-      // note that the skiped columns are processed later.
+      RhsScalar b0 = rhs(j,0);
+
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
+      cc2 += cj.pmul(lhs(i+2,j), b0);
+      cc3 += cj.pmul(lhs(i+3,j), b0);
+      cc4 += cj.pmul(lhs(i+4,j), b0);
+      cc5 += cj.pmul(lhs(i+5,j), b0);
+      cc6 += cj.pmul(lhs(i+6,j), b0);
+      cc7 += cj.pmul(lhs(i+7,j), b0);
     }
-    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
-                      || LhsPacketSize==1
-                      || (skipRows + rowsAtOnce >= rows)
-                      || LhsPacketSize > depth
-                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
+    res[(i+2)*resIncr] += alpha*cc2;
+    res[(i+3)*resIncr] += alpha*cc3;
+    res[(i+4)*resIncr] += alpha*cc4;
+    res[(i+5)*resIncr] += alpha*cc5;
+    res[(i+6)*resIncr] += alpha*cc6;
+    res[(i+7)*resIncr] += alpha*cc7;
   }
-  else if(Vectorizable)
+  for(; i<n4; i+=4)
   {
-    alignedStart = 0;
-    alignedSize = depth;
-    alignmentPattern = AllAligned;
-  }
-
-  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
-  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)),
+              c3 = pset1<ResPacket>(ResScalar(0));
 
-  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
-  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
-  {
-    // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ??
-    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
-    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
-
-    // this helps the compiler generating good binary code
-    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),    lhs1 = lhs.getVectorMapper(i+offset1, 0),
-                     lhs2 = lhs.getVectorMapper(i+2, 0),    lhs3 = lhs.getVectorMapper(i+offset3, 0);
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
 
-    if (Vectorizable)
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+    }
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    for(; j<cols; ++j)
     {
-      /* explicit vectorization */
-      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
-                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
+      RhsScalar b0 = rhs(j,0);
 
-      // process initial unaligned coeffs
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=0; j<alignedStart; ++j)
-      {
-        RhsScalar b = rhs(j, 0);
-        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
-        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
-      }
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
+      cc2 += cj.pmul(lhs(i+2,j), b0);
+      cc3 += cj.pmul(lhs(i+3,j), b0);
+    }
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
+    res[(i+2)*resIncr] += alpha*cc2;
+    res[(i+3)*resIncr] += alpha*cc3;
+  }
+  for(; i<n2; i+=2)
+  {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0));
 
-      if (alignedSize>alignedStart)
-      {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if (peels>1)
-            {
-              /* Here we proccess 4 rows with with two peeled iterations to hide
-               * the overhead of unaligned loads. Moreover unaligned loads are handled
-               * using special shift/move operations between the two aligned packets
-               * overlaping the desired unaligned packet. This is *much* more efficient
-               * than basic unaligned loads.
-               */
-              LhsPacket A01, A02, A03, A11, A12, A13;
-              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
-              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
-              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-
-              for (; j<peeledSize; j+=peels*RhsPacketSize)
-              {
-                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
-                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
-                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
-                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
-
-                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
-                ptmp1 = pcj.pmadd(A01, b, ptmp1);
-                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
-                ptmp2 = pcj.pmadd(A02, b, ptmp2);
-                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
-                ptmp3 = pcj.pmadd(A03, b, ptmp3);
-                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
-
-                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
-                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
-                ptmp1 = pcj.pmadd(A11, b, ptmp1);
-                ptmp2 = pcj.pmadd(A12, b, ptmp2);
-                ptmp3 = pcj.pmadd(A13, b, ptmp3);
-              }
-            }
-            for (; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
-            break;
-        }
-        tmp0 += predux(ptmp0);
-        tmp1 += predux(ptmp1);
-        tmp2 += predux(ptmp2);
-        tmp3 += predux(ptmp3);
-      }
-    } // end explicit vectorization
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
 
-    // process remaining coeffs (or all if no explicit vectorization)
-    // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<depth; ++j)
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+    }
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    for(; j<cols; ++j)
     {
-      RhsScalar b = rhs(j, 0);
-      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
-      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
+      RhsScalar b0 = rhs(j,0);
+
+      cc0 += cj.pmul(lhs(i+0,j), b0);
+      cc1 += cj.pmul(lhs(i+1,j), b0);
     }
-    res[i*resIncr]            += alpha*tmp0;
-    res[(i+offset1)*resIncr]  += alpha*tmp1;
-    res[(i+2)*resIncr]        += alpha*tmp2;
-    res[(i+offset3)*resIncr]  += alpha*tmp3;
+    res[(i+0)*resIncr] += alpha*cc0;
+    res[(i+1)*resIncr] += alpha*cc1;
   }
-
-  // process remaining first and last rows (at most columnsAtOnce-1)
-  Index end = rows;
-  Index start = rowBound;
-  do
+  for(; i<rows; ++i)
   {
-    for (Index i=start; i<end; ++i)
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
+    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
     {
-      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
-      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
-      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
-      // process first unaligned result's coeffs
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=0; j<alignedStart; ++j)
-        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
-
-      if (alignedSize>alignedStart)
-      {
-        // process aligned rhs coeffs
-        if (lhs0.template aligned<LhsPacket>(alignedStart))
-          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
-        else
-          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
-        tmp0 += predux(ptmp0);
-      }
-
-      // process remaining scalars
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=alignedSize; j<depth; ++j)
-        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
-      res[i*resIncr] += alpha*tmp0;
+      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
     }
-    if (skipRows)
+    ResScalar cc0 = predux(c0);
+    if (HasHalf) {
+      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
+        {
+          RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
+          c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
+        }
+      cc0 += predux(c0_h);
+    }
+    if (HasQuarter) {
+      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
+        {
+          RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
+          c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
+        }
+      cc0 += predux(c0_q);
+    }
+    for(; j<cols; ++j)
     {
-      start = 0;
-      end = skipRows;
-      skipRows = 0;
+      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
     }
-    else
-      break;
-  } while(Vectorizable);
-
-  #undef _EIGEN_ACCUMULATE_PACKETS
+    res[i*resIncr] += alpha*cc0;
+  }
 }
 
 } // end namespace internal
diff --git a/contrib/eigen/Eigen/src/Core/products/Parallelizer.h b/contrib/eigen/Eigen/src/Core/products/Parallelizer.h
index 67b2442b53a114af3c08829af4344acc6be7a42f..8f91879e45f87796bf9113a574d5f89747413edc 100644
--- a/contrib/eigen/Eigen/src/Core/products/Parallelizer.h
+++ b/contrib/eigen/Eigen/src/Core/products/Parallelizer.h
@@ -10,6 +10,10 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
+#if EIGEN_HAS_CXX11_ATOMIC
+#include <atomic>
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -18,7 +22,7 @@ namespace internal {
 inline void manage_multi_threading(Action action, int* v)
 {
   static int m_maxThreads = -1;
-  EIGEN_UNUSED_VARIABLE(m_maxThreads);
+  EIGEN_UNUSED_VARIABLE(m_maxThreads)
 
   if(action==SetAction)
   {
@@ -76,8 +80,17 @@ template<typename Index> struct GemmParallelInfo
 {
   GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
 
+  // volatile is not enough on all architectures (see bug 1572)
+  // to guarantee that when thread A says to thread B that it is
+  // done with packing a block, then all writes have been really
+  // carried out... C++11 memory model+atomic guarantees this.
+#if EIGEN_HAS_CXX11_ATOMIC
+  std::atomic<Index> sync;
+  std::atomic<int> users;
+#else
   Index volatile sync;
   int volatile users;
+#endif
 
   Index lhs_start;
   Index lhs_length;
@@ -88,11 +101,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
 {
   // TODO when EIGEN_USE_BLAS is defined,
   // we should still enable OMP for other scalar types
-#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
+  // Without C++11, we have to disable GEMM's parallelization on
+  // non x86 architectures because there volatile is not enough for our purpose.
+  // See bug 1572.
+#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
   // FIXME the transpose variable is only needed to properly split
   // the matrix product when multithreading is enabled. This is a temporary
   // fix to support row-major destination matrices. This whole
-  // parallelizer mechanism has to be redisigned anyway.
+  // parallelizer mechanism has to be redesigned anyway.
   EIGEN_UNUSED_VARIABLE(depth);
   EIGEN_UNUSED_VARIABLE(transpose);
   func(0,rows, 0,cols);
@@ -113,12 +129,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
   double work = static_cast<double>(rows) * static_cast<double>(cols) *
       static_cast<double>(depth);
   double kMinTaskSize = 50000;  // FIXME improve this heuristic.
-  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
+  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
 
   // compute the number of threads we are going to use
   Index threads = std::min<Index>(nbThreads(), pb_max_threads);
 
-  // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session,
+  // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
   // then abort multi-threading
   // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
   if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
@@ -132,8 +148,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
 
   ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
 
-  int errorCount = 0;
-  #pragma omp parallel num_threads(threads) reduction(+: errorCount)
+  #pragma omp parallel num_threads(threads)
   {
     Index i = omp_get_thread_num();
     // Note that the actual number of threads might be lower than the number of request ones.
@@ -152,14 +167,9 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
     info[i].lhs_start = r0;
     info[i].lhs_length = actualBlockRows;
 
-    EIGEN_TRY {
-      if(transpose) func(c0, actualBlockCols, 0, rows, info);
-      else          func(0, rows, c0, actualBlockCols, info);
-    } EIGEN_CATCH(...) {
-      ++errorCount;
-    }
+    if(transpose) func(c0, actualBlockCols, 0, rows, info);
+    else          func(0, rows, c0, actualBlockCols, info);
   }
-  if (errorCount) EIGEN_THROW_X(Eigen::eigen_assert_exception());
 #endif
 }
 
diff --git a/contrib/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/contrib/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 04c933480bdbb895f0704755048a9d646bdd84f2..33ecf10f610d4515edee6402360a34927d8abf2e 100644
--- a/contrib/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/contrib/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -45,14 +45,23 @@ struct symm_pack_lhs
   }
   void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
   {
-    enum { PacketSize = packet_traits<Scalar>::size };
+    typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
+    typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
+    enum { PacketSize = packet_traits<Scalar>::size,
+           HalfPacketSize = unpacket_traits<HalfPacket>::size,
+           QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+           HasHalf = (int)HalfPacketSize < (int)PacketSize,
+           HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
     const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
     Index count = 0;
     //Index peeled_mc3 = (rows/Pack1)*Pack1;
     
     const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
     const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+    const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+    const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
     
     if(Pack1>=3*PacketSize)
       for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
@@ -66,8 +75,16 @@ struct symm_pack_lhs
       for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
         pack<1*PacketSize>(blockA, lhs, cols, i, count);
 
+    if(HasHalf && Pack1>=HalfPacketSize)
+      for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
+        pack<HalfPacketSize>(blockA, lhs, cols, i, count);
+
+    if(HasQuarter && Pack1>=QuarterPacketSize)
+      for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
+        pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
+
     // do the same with mr==1
-    for(Index i=peeled_mc1; i<rows; i++)
+    for(Index i=peeled_mc_quarter; i<rows; i++)
     {
       for(Index k=0; k<i; k++)
         blockA[count++] = lhs(i, k);                   // normal
@@ -355,7 +372,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
 
     for(Index k2=0; k2<size; k2+=kc)
     {
@@ -390,7 +407,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
       for(Index i2=k2+kc; i2<size; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,size)-i2;
-        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
           (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
         gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
@@ -442,7 +459,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=0; k2<size; k2+=kc)
diff --git a/contrib/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h b/contrib/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 3fd180e6c0b6735eba9b2d59f41ff33d6466c38c..d38fd72b22710e84f049273ac2e024bfa9b77930 100644
--- a/contrib/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/contrib/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {
 
 /* Optimized selfadjoint matrix * vector product:
- * This algorithm processes 2 columns at onces that allows to both reduce
+ * This algorithm processes 2 columns at once that allows to both reduce
  * the number of load/stores of the result by a factor 2 and to reduce
  * the instruction dependency.
  */
@@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
 struct selfadjoint_matrix_vector_product
 
 {
-static EIGEN_DONT_INLINE void run(
+static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
   const Scalar*  rhs,
@@ -36,7 +37,8 @@ static EIGEN_DONT_INLINE void run(
 };
 
 template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
   const Scalar*  rhs,
@@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-
-  Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+  Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
   if (FirstTriangular)
     bound = size - bound;
 
@@ -175,7 +176,8 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   enum { LhsUpLo = LhsMode&(Upper|Lower) };
 
   template<typename Dest>
-  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
diff --git a/contrib/eigen/Eigen/src/Core/products/SelfadjointProduct.h b/contrib/eigen/Eigen/src/Core/products/SelfadjointProduct.h
index ef12c98f6cbf1029935185aa51db6402d183baba..a21be80504f4f9412d003e3ead615feac56c0099 100644
--- a/contrib/eigen/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/contrib/eigen/Eigen/src/Core/products/SelfadjointProduct.h
@@ -111,7 +111,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
       Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
       IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
       ::run(size, depth,
-            &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
+            actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(),
             mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
   }
 };
@@ -120,7 +120,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
 {
   selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
diff --git a/contrib/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h b/contrib/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
index 2ae3641111c7db3641f7ef95a79446fed9bad4b7..f752a0bf093598a602abee2645a9dc34c4883470 100644
--- a/contrib/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/contrib/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
 template<typename Scalar, typename Index, typename UType, typename VType>
 struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
 {
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
+  void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
   {
     const Index size = u.size();
     for (Index i=0; i<size; ++i)
@@ -57,7 +58,7 @@ template<bool Cond, typename T> struct conj_expr_if
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU, typename DerivedV>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
 {
   typedef internal::blas_traits<DerivedU> UBlasTraits;
@@ -79,8 +80,8 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
   if (IsRowMajor)
     actualAlpha = numext::conj(actualAlpha);
 
-  typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type UType;
-  typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type VType;
+  typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), _ActualUType>::type>::type UType;
+  typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), _ActualVType>::type>::type VType;
   internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
     (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
     ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha);
diff --git a/contrib/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/contrib/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 2fb408d1d73380f229608e7f94d3e13bd743cbbf..f0c60507ab10100e02933e253d25a03ff99256d7 100644
--- a/contrib/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/contrib/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -155,7 +155,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
       triangularBuffer.diagonal().setOnes();
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=IsLower ? depth : 0;
@@ -226,7 +226,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
         for(Index i2=start; i2<end; i2+=mc)
         {
           const Index actual_mc = (std::min)(i2+mc,end)-i2;
-          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
             (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
           gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
@@ -305,7 +305,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
       triangularBuffer.diagonal().setOnes();
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
 
diff --git a/contrib/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h b/contrib/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
index e3ed2cd19ead6a8cf5bf3fa771408b4c456d6892..6d879ba00fd72eed240e4f2cb50b6023f8c0e0af 100644
--- a/contrib/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/contrib/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 
     conj_if<Conjugate> conj;
     gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
 
     // the goal here is to subdivise the Rhs panels such that we keep some cache
@@ -136,7 +136,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
               }
               else
               {
-                Scalar b = (other(i,j) *= a);
+                Scalar& otherij = other(i,j);
+                otherij *= a;
+                Scalar b = otherij;
                 typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
                 typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
                 for (Index i3=0;i3<rs;++i3)
@@ -229,7 +231,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
     gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
         IsLower ? k2>0 : k2<size;
diff --git a/contrib/eigen/Eigen/src/Core/products/TriangularSolverVector.h b/contrib/eigen/Eigen/src/Core/products/TriangularSolverVector.h
index b994759b268795bd9678e3ffa83f1c9ee2a40f4e..6473170169372e912f192b46c8962e9718ab3cc6 100644
--- a/contrib/eigen/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/contrib/eigen/Eigen/src/Core/products/TriangularSolverVector.h
@@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
+        // 2 - it is slightly faster at runtime
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
 
@@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
 
-        if(!(Mode & UnitDiag))
+        if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
           rhs[i] /= cjLhs(i,i);
       }
     }
@@ -114,20 +114,23 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       for(Index k=0; k<actualPanelWidth; ++k)
       {
         Index i = IsLower ? pi+k : pi-k-1;
-        if(!(Mode & UnitDiag))
-          rhs[i] /= cjLhs.coeff(i,i);
-
-        Index r = actualPanelWidth - k - 1; // remaining size
-        Index s = IsLower ? i+1 : i-r;
-        if (r>0)
-          Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        {
+          if(!(Mode & UnitDiag))
+            rhs[i] /= cjLhs.coeff(i,i);
+
+          Index r = actualPanelWidth - k - 1; // remaining size
+          Index s = IsLower ? i+1 : i-r;
+          if (r>0)
+            Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        }
       }
       Index r = IsLower ? size - endBlock : startBlock; // remaining size
       if (r > 0)
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
+        // 2 - it is slightly faster at runtime
         general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
             r, actualPanelWidth,
             LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
diff --git a/contrib/eigen/Eigen/src/Core/util/BlasUtil.h b/contrib/eigen/Eigen/src/Core/util/BlasUtil.h
index 3dff9bc9b331bc839b1490ae34142b44ce0de964..e16a564980726ca200dcaf51e15adc72f89876d5 100755
--- a/contrib/eigen/Eigen/src/Core/util/BlasUtil.h
+++ b/contrib/eigen/Eigen/src/Core/util/BlasUtil.h
@@ -24,7 +24,7 @@ struct gebp_kernel;
 template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
 struct gemm_pack_rhs;
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_lhs;
 
 template<
@@ -39,90 +39,6 @@ template<typename Index,
          typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
 struct general_matrix_vector_product;
 
-
-template<bool Conjugate> struct conj_if;
-
-template<> struct conj_if<true> {
-  template<typename T>
-  inline T operator()(const T& x) const { return numext::conj(x); }
-  template<typename T>
-  inline T pconj(const T& x) const { return internal::pconj(x); }
-};
-
-template<> struct conj_if<false> {
-  template<typename T>
-  inline const T& operator()(const T& x) const { return x; }
-  template<typename T>
-  inline const T& pconj(const T& x) const { return x; }
-};
-
-// Generic implementation for custom complex types.
-template<typename LhsScalar, typename RhsScalar, bool ConjLhs, bool ConjRhs>
-struct conj_helper
-{
-  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType Scalar;
-
-  EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const
-  { return padd(c, pmul(x,y)); }
-
-  EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const
-  { return conj_if<ConjLhs>()(x) *  conj_if<ConjRhs>()(y); }
-};
-
-template<typename Scalar> struct conj_helper<Scalar,Scalar,false,false>
-{
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); }
-};
-
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, false,true>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
-
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); }
-};
-
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,false>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
-
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); }
-};
-
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,true>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
-
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); }
-};
-
-template<typename RealScalar,bool Conj> struct conj_helper<std::complex<RealScalar>, RealScalar, Conj,false>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const
-  { return padd(c, pmul(x,y)); }
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const
-  { return conj_if<Conj>()(x)*y; }
-};
-
-template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::complex<RealScalar>, false,Conj>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const
-  { return padd(c, pmul(x,y)); }
-  EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const
-  { return x*conj_if<Conj>()(y); }
-};
-
 template<typename From,typename To> struct get_factor {
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
 };
@@ -159,11 +75,9 @@ template<typename Scalar, typename Index, int AlignmentType, int Incr=1>
 class BlasLinearMapper;
 
 template<typename Scalar, typename Index, int AlignmentType>
-class BlasLinearMapper<Scalar,Index,AlignmentType,1> {
-  public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
+class BlasLinearMapper<Scalar,Index,AlignmentType>
+{
+public:
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1)
     : m_data(data)
   {
@@ -179,19 +93,17 @@ class BlasLinearMapper<Scalar,Index,AlignmentType,1> {
     return m_data[i];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
-    return ploadt<Packet, AlignmentType>(m_data + i);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
-    return ploadt<HalfPacket, AlignmentType>(m_data + i);
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return ploadt<PacketType, AlignmentType>(m_data + i);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
-    pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
+    pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
   }
 
-  protected:
+protected:
   Scalar *m_data;
 };
 
@@ -199,13 +111,59 @@ class BlasLinearMapper<Scalar,Index,AlignmentType,1> {
 template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
 class blas_data_mapper;
 
+// TMP to help PacketBlock store implementation.
+// There's currently no known use case for PacketBlock load.
+// The default implementation assumes ColMajor order.
+// It always store each packet sequentially one `stride` apart.
+template<typename Index, typename Scalar, typename Packet, int n, int idx, int StorageOrder>
+struct PacketBlockManagement
+{
+  PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, StorageOrder> pbm;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    pbm.store(to, stride, i, j, block);
+    pstoreu<Scalar>(to + i + (j + idx)*stride, block.packet[idx]);
+  }
+};
+
+// PacketBlockManagement specialization to take care of RowMajor order without ifs.
+template<typename Index, typename Scalar, typename Packet, int n, int idx>
+struct PacketBlockManagement<Index, Scalar, Packet, n, idx, RowMajor>
+{
+  PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, RowMajor> pbm;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    pbm.store(to, stride, i, j, block);
+    pstoreu<Scalar>(to + j + (i + idx)*stride, block.packet[idx]);
+  }
+};
+
+template<typename Index, typename Scalar, typename Packet, int n, int StorageOrder>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, StorageOrder>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    EIGEN_UNUSED_VARIABLE(to);
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(block);
+  }
+};
+
+template<typename Index, typename Scalar, typename Packet, int n>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, RowMajor>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock<Packet, n> &block) const {
+    EIGEN_UNUSED_VARIABLE(to);
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(block);
+  }
+};
+
 template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
 class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
 {
 public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
   typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
   typedef BlasVectorMapper<Scalar, Index> VectorMapper;
 
@@ -235,12 +193,14 @@ public:
     return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
-    return ploadt<Packet, AlignmentType>(&operator()(i, j));
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return ploadt<PacketType, AlignmentType>(&operator()(i, j));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
-    return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
+  template <typename PacketT, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
+    return ploadt<PacketT, AlignmentT>(&operator()(i, j));
   }
 
   template<typename SubPacket>
@@ -263,7 +223,12 @@ public:
     return internal::first_default_aligned(m_data, size);
   }
 
-  protected:
+  template<typename SubPacket, int n>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock<SubPacket, n> &block) const {
+    PacketBlockManagement<Index, Scalar, SubPacket, n, n-1, StorageOrder> pbm;
+    pbm.store(m_data, m_stride, i, j, block);
+  }
+protected:
   Scalar* EIGEN_RESTRICT m_data;
   const Index m_stride;
 };
@@ -275,9 +240,6 @@ template<typename Scalar, typename Index, int AlignmentType, int Incr>
 class BlasLinearMapper
 {
 public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {}
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
@@ -288,8 +250,9 @@ public:
     return m_data[i*m_incr.value()];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
-    return pgather<Scalar,Packet>(m_data + i*m_incr.value(), m_incr.value());
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return pgather<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value());
   }
 
   template<typename PacketType>
@@ -306,9 +269,6 @@ template<typename Scalar, typename Index, int StorageOrder, int AlignmentType,in
 class blas_data_mapper
 {
 public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
   typedef BlasLinearMapper<Scalar, Index, AlignmentType,Incr> LinearMapper;
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {}
@@ -327,8 +287,9 @@ public:
     return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
-    return pgather<Scalar,Packet>(&operator()(i, j),m_incr.value());
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
   }
 
   template <typename PacketT, int AlignmentT>
@@ -346,6 +307,77 @@ public:
     return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
   }
 
+  // storePacketBlock_helper defines a way to access values inside the PacketBlock, this is essentially required by the Complex types.
+  template<typename SubPacket, typename ScalarT, int n, int idx>
+  struct storePacketBlock_helper
+  {
+    storePacketBlock_helper<SubPacket, ScalarT, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup, i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        ScalarT *v = &sup->operator()(i+l, j+idx);
+        *v = block.packet[idx][l];
+      }
+    }
+  };
+
+  template<typename SubPacket, int n, int idx>
+  struct storePacketBlock_helper<SubPacket, std::complex<float>, n, idx>
+  {
+    storePacketBlock_helper<SubPacket, std::complex<float>, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup,i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        std::complex<float> *v = &sup->operator()(i+l, j+idx);
+        v->real(block.packet[idx].v[2*l+0]);
+        v->imag(block.packet[idx].v[2*l+1]);
+      }
+    }
+  };
+
+  template<typename SubPacket, int n, int idx>
+  struct storePacketBlock_helper<SubPacket, std::complex<double>, n, idx>
+  {
+    storePacketBlock_helper<SubPacket, std::complex<double>, n, idx-1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j, const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup,i,j,block);
+      for(int l = 0; l < unpacket_traits<SubPacket>::size; l++)
+      {
+        std::complex<double> *v = &sup->operator()(i+l, j+idx);
+        v->real(block.packet[idx].v[2*l+0]);
+        v->imag(block.packet[idx].v[2*l+1]);
+      }
+    }
+  };
+
+  template<typename SubPacket, typename ScalarT, int n>
+  struct storePacketBlock_helper<SubPacket, ScalarT, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+
+  template<typename SubPacket, int n>
+  struct storePacketBlock_helper<SubPacket, std::complex<float>, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+
+  template<typename SubPacket, int n>
+  struct storePacketBlock_helper<SubPacket, std::complex<double>, n, -1>
+  {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index, const PacketBlock<SubPacket, n>& ) const {
+    }
+  };
+  // This function stores a PacketBlock on m_data, this approach is really quite slow compare to Incr=1 and should be avoided when possible.
+  template<typename SubPacket, int n>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock<SubPacket, n>&block) const {
+    storePacketBlock_helper<SubPacket, Scalar, n, n-1> spb;
+    spb.store(this, i,j,block);
+  }
 protected:
   Scalar* EIGEN_RESTRICT m_data;
   const Index m_stride;
@@ -379,14 +411,15 @@ template<typename XprType> struct blas_traits
     HasUsableDirectAccess = (    (int(XprType::Flags)&DirectAccessBit)
                               && (   bool(XprType::IsVectorAtCompileTime)
                                   || int(inner_stride_at_compile_time<XprType>::ret) == 1)
-                             ) ?  1 : 0
+                             ) ?  1 : 0,
+    HasScalarFactor = false
   };
   typedef typename conditional<bool(HasUsableDirectAccess),
     ExtractType,
     typename _ExtractType::PlainObject
     >::type DirectLinearAccessType;
-  static inline ExtractType extract(const XprType& x) { return x; }
-  static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+  static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
 };
 
 // pop conjugate
@@ -411,17 +444,23 @@ template<typename Scalar, typename NestedXpr, typename Plain>
 struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
  : blas_traits<NestedXpr>
 {
+  enum {
+    HasScalarFactor = true
+  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
-  static inline Scalar extractScalarFactor(const XprType& x)
+  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+  static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
   { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
 };
 template<typename Scalar, typename NestedXpr, typename Plain>
 struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
  : blas_traits<NestedXpr>
 {
+  enum {
+    HasScalarFactor = true
+  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
   typedef typename Base::ExtractType ExtractType;
@@ -440,6 +479,9 @@ template<typename Scalar, typename NestedXpr>
 struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> >
  : blas_traits<NestedXpr>
 {
+  enum {
+    HasScalarFactor = true
+  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
@@ -476,7 +518,7 @@ struct blas_traits<const T>
 
 template<typename T, bool HasUsableDirectAccess=blas_traits<T>::HasUsableDirectAccess>
 struct extract_data_selector {
-  static const typename T::Scalar* run(const T& m)
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m)
   {
     return blas_traits<T>::extract(m).data();
   }
@@ -487,11 +529,53 @@ struct extract_data_selector<T,false> {
   static typename T::Scalar* run(const T&) { return 0; }
 };
 
-template<typename T> const typename T::Scalar* extract_data(const T& m)
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m)
 {
   return extract_data_selector<T>::run(m);
 }
 
+/**
+ * \c combine_scalar_factors extracts and multiplies factors from GEMM and GEMV products.
+ * There is a specialization for booleans
+ */
+template<typename ResScalar, typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs)
+  {
+    return alpha * blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+template<typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl<bool, Lhs, Rhs>
+{
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs)
+  {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs)
+  {
+    return alpha && blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+
+template<typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs)
+{
+  return combine_scalar_factors_impl<ResScalar,Lhs,Rhs>::run(alpha, lhs, rhs);
+}
+template<typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs)
+{
+  return combine_scalar_factors_impl<ResScalar,Lhs,Rhs>::run(lhs, rhs);
+}
+
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/Core/util/Constants.h b/contrib/eigen/Eigen/src/Core/util/Constants.h
index 7587d684243566fb97a61312663d66f4a154b672..35dcaa7b38630b856703f86cc91a58054b1a3d8a 100644
--- a/contrib/eigen/Eigen/src/Core/util/Constants.h
+++ b/contrib/eigen/Eigen/src/Core/util/Constants.h
@@ -3,6 +3,7 @@
 //
 // Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2020, Arm Limited and Contributors
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -25,6 +26,10 @@ const int Dynamic = -1;
   */
 const int DynamicIndex = 0xffffff;
 
+/** This value means that the increment to go from one value to another in a sequence is not constant for each step.
+  */
+const int UndefinedIncr = 0xfffffe;
+
 /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
   * The value Infinity there means the L-infinity norm.
   */
@@ -152,7 +157,7 @@ const unsigned int DirectAccessBit = 0x40;
 /** \deprecated \ingroup flags
   *
   * means the first coefficient packet is guaranteed to be aligned.
-  * An expression cannot has the AlignedBit without the PacketAccessBit flag.
+  * An expression cannot have the AlignedBit without the PacketAccessBit flag.
   * In other words, this means we are allow to perform an aligned packet access to the first element regardless
   * of the expression kind:
   * \code
@@ -250,12 +255,6 @@ enum AlignmentType {
 #endif
 };
 
-/** \ingroup enums
- * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */
-// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox
-// TODO: find out what to do with that. Adapt the AlignedBox API ?
-enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
-
 /** \ingroup enums
   * Enum containing possible values for the \p Direction parameter of
   * Reverse, PartialReduxExpr and VectorwiseOp. */
@@ -330,9 +329,20 @@ enum StorageOptions {
   * Enum for specifying whether to apply or solve on the left or right. */
 enum SideType {
   /** Apply transformation on the left. */
-  OnTheLeft = 1,  
+  OnTheLeft = 1,
   /** Apply transformation on the right. */
-  OnTheRight = 2  
+  OnTheRight = 2
+};
+
+/** \ingroup enums
+ * Enum for specifying NaN-propagation behavior, e.g. for coeff-wise min/max. */
+enum NaNPropagationOptions {
+  /**  Implementation defined behavior if NaNs are present. */
+  PropagateFast = 0,
+  /**  Always propagate NaNs. */
+  PropagateNaN,
+  /**  Always propagate not-NaNs. */
+  PropagateNumbers
 };
 
 /* the following used to be written as:
@@ -464,6 +474,8 @@ namespace Architecture
     AltiVec = 0x2,
     VSX = 0x3,
     NEON = 0x4,
+    MSA = 0x5,
+    SVE = 0x6,
 #if defined EIGEN_VECTORIZE_SSE
     Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -472,6 +484,10 @@ namespace Architecture
     Target = VSX
 #elif defined EIGEN_VECTORIZE_NEON
     Target = NEON
+#elif defined EIGEN_VECTORIZE_SVE
+    Target = SVE
+#elif defined EIGEN_VECTORIZE_MSA
+    Target = MSA
 #else
     Target = Generic
 #endif
diff --git a/contrib/eigen/Eigen/src/Core/util/DisableStupidWarnings.h b/contrib/eigen/Eigen/src/Core/util/DisableStupidWarnings.h
index 74f74cc42b84b45ad68ac0d6155d68d15bc4f0f4..fe0cfec0bc2461ac44abca8f3d05b468d3c60fd9 100755
--- a/contrib/eigen/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/contrib/eigen/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -4,7 +4,6 @@
 #ifdef _MSC_VER
   // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
   // 4101 - unreferenced local variable
-  // 4127 - conditional expression is constant
   // 4181 - qualifier applied to reference type ignored
   // 4211 - nonstandard extension used : redefined extern to static
   // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
@@ -20,7 +19,7 @@
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+  #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -42,6 +41,17 @@
     #pragma clang diagnostic push
   #endif
   #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+  #if __clang_major__ >= 3 && __clang_minor__ >= 5
+    #pragma clang diagnostic ignored "-Wabsolute-value"
+  #endif
+  #if __clang_major__ >= 10
+    #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
+  #endif
+  #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L
+    // warning: generic selections are a C11-specific feature
+    // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
+    #pragma clang diagnostic ignored "-Wc11-extensions"
+  #endif
 
 #elif defined __GNUC__
 
@@ -64,6 +74,7 @@
 #endif
 
 #if defined __NVCC__
+  #pragma diag_suppress boolean_controlling_expr_is_constant
   // Disable the "statement is unreachable" message
   #pragma diag_suppress code_is_unreachable
   // Disable the "dynamic initialization in unreachable code" message
@@ -81,6 +92,7 @@
   #pragma diag_suppress 2671
   #pragma diag_suppress 2735
   #pragma diag_suppress 2737
+  #pragma diag_suppress 2739
 #endif
 
 #else
diff --git a/contrib/eigen/Eigen/src/Core/util/ForwardDeclarations.h b/contrib/eigen/Eigen/src/Core/util/ForwardDeclarations.h
index 134544f9643728148a86c6f37404e92c0c51d772..2f9cc449176fb2ea3cbe83d9c8dc2441153fdfc4 100644
--- a/contrib/eigen/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/contrib/eigen/Eigen/src/Core/util/ForwardDeclarations.h
@@ -79,6 +79,8 @@ template<typename ExpressionType> class ForceAlignedAccess;
 template<typename ExpressionType> class SwapWrapper;
 
 template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false> class Block;
+template<typename XprType, typename RowIndices, typename ColIndices> class IndexedView;
+template<typename XprType, int Rows=Dynamic, int Cols=Dynamic, int Order=0> class Reshaped;
 
 template<typename MatrixType, int Size=Dynamic> class VectorBlock;
 template<typename MatrixType> class Transpose;
@@ -108,7 +110,7 @@ template<typename _IndicesType> class TranspositionsWrapper;
 template<typename Derived,
          int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
 > class MapBase;
-template<int InnerStrideAtCompileTime, int OuterStrideAtCompileTime> class Stride;
+template<int OuterStrideAtCompileTime, int InnerStrideAtCompileTime> class Stride;
 template<int Value = Dynamic> class InnerStride;
 template<int Value = Dynamic> class OuterStride;
 template<typename MatrixType, int MapOptions=Unaligned, typename StrideType = Stride<0,0> > class Map;
@@ -129,6 +131,10 @@ template<typename Derived> class SolverBase;
 template<typename XprType> class InnerIterator;
 
 namespace internal {
+template<typename XprType> class generic_randaccess_stl_iterator;
+template<typename XprType> class pointer_based_stl_iterator;
+template<typename XprType, DirectionType Direction> class subvector_stl_iterator;
+template<typename XprType, DirectionType Direction> class subvector_stl_reverse_iterator;
 template<typename DecompositionType> struct kernel_retval_base;
 template<typename DecompositionType> struct kernel_retval;
 template<typename DecompositionType> struct image_retval_base;
@@ -174,14 +180,15 @@ template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRh
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_sum_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_difference_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_conj_product_op;
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_min_op;
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_max_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar, int NaNPropagation=PropagateFast> struct scalar_min_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar, int NaNPropagation=PropagateFast> struct scalar_max_op;
 template<typename Scalar> struct scalar_opposite_op;
 template<typename Scalar> struct scalar_conjugate_op;
 template<typename Scalar> struct scalar_real_op;
 template<typename Scalar> struct scalar_imag_op;
 template<typename Scalar> struct scalar_abs_op;
 template<typename Scalar> struct scalar_abs2_op;
+template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_absolute_difference_op;
 template<typename Scalar> struct scalar_sqrt_op;
 template<typename Scalar> struct scalar_rsqrt_op;
 template<typename Scalar> struct scalar_exp_op;
@@ -198,7 +205,7 @@ template<typename Scalar, typename NewType> struct scalar_cast_op;
 template<typename Scalar> struct scalar_random_op;
 template<typename Scalar> struct scalar_constant_op;
 template<typename Scalar> struct scalar_identity_op;
-template<typename Scalar,bool iscpx> struct scalar_sign_op;
+template<typename Scalar,bool is_complex, bool is_integer> struct scalar_sign_op;
 template<typename Scalar,typename ScalarExponent> struct scalar_pow_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_hypot_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
@@ -209,11 +216,27 @@ template<typename Scalar> struct scalar_lgamma_op;
 template<typename Scalar> struct scalar_digamma_op;
 template<typename Scalar> struct scalar_erf_op;
 template<typename Scalar> struct scalar_erfc_op;
+template<typename Scalar> struct scalar_ndtri_op;
 template<typename Scalar> struct scalar_igamma_op;
 template<typename Scalar> struct scalar_igammac_op;
 template<typename Scalar> struct scalar_zeta_op;
 template<typename Scalar> struct scalar_betainc_op;
 
+// Bessel functions in SpecialFunctions module
+template<typename Scalar> struct scalar_bessel_i0_op;
+template<typename Scalar> struct scalar_bessel_i0e_op;
+template<typename Scalar> struct scalar_bessel_i1_op;
+template<typename Scalar> struct scalar_bessel_i1e_op;
+template<typename Scalar> struct scalar_bessel_j0_op;
+template<typename Scalar> struct scalar_bessel_y0_op;
+template<typename Scalar> struct scalar_bessel_j1_op;
+template<typename Scalar> struct scalar_bessel_y1_op;
+template<typename Scalar> struct scalar_bessel_k0_op;
+template<typename Scalar> struct scalar_bessel_k0e_op;
+template<typename Scalar> struct scalar_bessel_k1_op;
+template<typename Scalar> struct scalar_bessel_k1e_op;
+
+
 } // end namespace internal
 
 struct IOFormat;
@@ -251,6 +274,7 @@ template<typename MatrixType> class HouseholderQR;
 template<typename MatrixType> class ColPivHouseholderQR;
 template<typename MatrixType> class FullPivHouseholderQR;
 template<typename MatrixType> class CompleteOrthogonalDecomposition;
+template<typename MatrixType> class SVDBase;
 template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
 template<typename MatrixType> class BDCSVD;
 template<typename MatrixType, int UpLo = Lower> class LLT;
diff --git a/contrib/eigen/Eigen/src/Core/util/MKL_support.h b/contrib/eigen/Eigen/src/Core/util/MKL_support.h
index b7d6ecc76e81e684be9cb3095c705334c16c79ab..17963fad4d42c979c928b053b45fcb3b099b30f0 100755
--- a/contrib/eigen/Eigen/src/Core/util/MKL_support.h
+++ b/contrib/eigen/Eigen/src/Core/util/MKL_support.h
@@ -55,7 +55,11 @@
 
 
 #if defined EIGEN_USE_MKL
-#   include <mkl.h> 
+#   if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL)
+#       define MKL_DIRECT_CALL
+#       define MKL_DIRECT_CALL_JUST_SET
+#   endif
+#   include <mkl.h>
 /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
 #   ifndef INTEL_MKL_VERSION
 #       undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
@@ -69,6 +73,9 @@
 #       undef   EIGEN_USE_MKL_VML
 #       undef   EIGEN_USE_LAPACKE_STRICT
 #       undef   EIGEN_USE_LAPACKE
+#       ifdef   MKL_DIRECT_CALL_JUST_SET
+#           undef MKL_DIRECT_CALL
+#       endif
 #   endif
 #endif
 
diff --git a/contrib/eigen/Eigen/src/Core/util/Macros.h b/contrib/eigen/Eigen/src/Core/util/Macros.h
index 87233eadf8a11c2d7e94750aca5a485a83954332..986c3d44db94c8ba339792b6738c47cdd2c5acbc 100644
--- a/contrib/eigen/Eigen/src/Core/util/Macros.h
+++ b/contrib/eigen/Eigen/src/Core/util/Macros.h
@@ -11,19 +11,56 @@
 #ifndef EIGEN_MACROS_H
 #define EIGEN_MACROS_H
 
+//------------------------------------------------------------------------------------------
+// Eigen version and basic defaults
+//------------------------------------------------------------------------------------------
+
 #define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 3
-#define EIGEN_MINOR_VERSION 8
+#define EIGEN_MAJOR_VERSION 4
+#define EIGEN_MINOR_VERSION 0
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
                                                                  EIGEN_MINOR_VERSION>=z))))
 
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#else
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
+// Upperbound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
+/** Allows to disable some optimizations which might affect the accuracy of the result.
+  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
+  * They currently include:
+  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
+  */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
+#endif
+
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#endif
+
+//------------------------------------------------------------------------------------------
 // Compiler identification, EIGEN_COMP_*
+//------------------------------------------------------------------------------------------
 
 /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
 #ifdef __GNUC__
-  #define EIGEN_COMP_GNUC 1
+  #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__)
 #else
   #define EIGEN_COMP_GNUC 0
 #endif
@@ -35,6 +72,12 @@
   #define EIGEN_COMP_CLANG 0
 #endif
 
+/// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML
+#if defined(__castxml__)
+  #define EIGEN_COMP_CASTXML 1
+#else
+  #define EIGEN_COMP_CASTXML 0
+#endif
 
 /// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm
 #if defined(__llvm__)
@@ -71,14 +114,44 @@
   #define EIGEN_COMP_MSVC 0
 #endif
 
+#if defined(__NVCC__)
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  #define EIGEN_COMP_NVCC  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+  #define EIGEN_COMP_NVCC __CUDACC_VER__
+#else
+  #error "NVCC did not define compiler version."
+#endif
+#else
+  #define EIGEN_COMP_NVCC 0
+#endif
+
 // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
-//  name  ver   MSC_VER
-//  2008    9      1500
-//  2010   10      1600
-//  2012   11      1700
-//  2013   12      1800
-//  2015   14      1900
-//  "15"   15      1900
+//  name        ver   MSC_VER
+//  2008         9      1500
+//  2010        10      1600
+//  2012        11      1700
+//  2013        12      1800
+//  2015        14      1900
+//  "15"        15      1900
+//  2017-14.1   15.0    1910
+//  2017-14.11  15.3    1911
+//  2017-14.12  15.5    1912
+//  2017-14.13  15.6    1913
+//  2017-14.14  15.7    1914
+
+/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise.
+#if defined(_MSVC_LANG)
+  #define EIGEN_COMP_MSVC_LANG _MSVC_LANG
+#else
+  #define EIGEN_COMP_MSVC_LANG 0
+#endif
+
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG:
+// MSVC option                          Standard  MSVC_LANG
+// /std:c++14 (default as of VS 2019)   C++14     201402L
+// /std:c++17                           C++17     201703L
+// /std:c++latest                       >C++17    >201703L
 
 /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl
 #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
@@ -87,16 +160,21 @@
   #define EIGEN_COMP_MSVC_STRICT 0
 #endif
 
-/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++
-#if defined(__IBMCPP__) || defined(__xlc__)
-  #define EIGEN_COMP_IBM 1
+/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++
+// XLC   version
+// 3.1   0x0301
+// 4.5   0x0405
+// 5.0   0x0500
+// 12.1  0x0C01
+#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__)
+  #define EIGEN_COMP_IBM __xlC__
 #else
   #define EIGEN_COMP_IBM 0
 #endif
 
-/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler
+/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler
 #if defined(__PGI)
-  #define EIGEN_COMP_PGI 1
+  #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__)
 #else
   #define EIGEN_COMP_PGI 0
 #endif
@@ -108,7 +186,7 @@
   #define EIGEN_COMP_ARM 0
 #endif
 
-/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
+/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler
 #if defined(__EMSCRIPTEN__)
   #define EIGEN_COMP_EMSCRIPTEN 1
 #else
@@ -142,9 +220,13 @@
 #endif
 
 
+
+//------------------------------------------------------------------------------------------
 // Architecture identification, EIGEN_ARCH_*
+//------------------------------------------------------------------------------------------
 
-#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
+
+#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__amd64)
   #define EIGEN_ARCH_x86_64 1
 #else
   #define EIGEN_ARCH_x86_64 0
@@ -170,18 +252,61 @@
 #endif
 
 /// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
   #define EIGEN_ARCH_ARM64 1
 #else
   #define EIGEN_ARCH_ARM64 0
 #endif
 
+/// \internal EIGEN_ARCH_ARM_OR_ARM64 set to 1 if the architecture is ARM or ARM64
 #if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
   #define EIGEN_ARCH_ARM_OR_ARM64 1
 #else
   #define EIGEN_ARCH_ARM_OR_ARM64 0
 #endif
 
+/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater.
+#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8
+#define EIGEN_ARCH_ARMV8 1
+#else
+#define EIGEN_ARCH_ARMV8 0
+#endif
+
+
+/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE
+/// compliant Arm fp16 type
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16
+    #if defined(__ARM_FP16_FORMAT_IEEE)
+      #define EIGEN_HAS_ARM64_FP16 1
+    #else
+      #define EIGEN_HAS_ARM64_FP16 0
+    #endif
+  #endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
+/// supports Neon vector intrinsics for fp16.
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
+    #else
+      #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
+    #endif
+  #endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
+/// supports Neon scalar intrinsics for fp16.
+#if EIGEN_ARCH_ARM64
+  #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+    #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+      #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
+    #endif
+  #endif
+#endif
+
 /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
 #if defined(__mips__) || defined(__mips)
   #define EIGEN_ARCH_MIPS 1
@@ -212,7 +337,9 @@
 
 
 
+//------------------------------------------------------------------------------------------
 // Operating system identification, EIGEN_OS_*
+//------------------------------------------------------------------------------------------
 
 /// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
 #if defined(__unix__) || defined(__unix)
@@ -299,9 +426,17 @@
   #define EIGEN_OS_WIN_STRICT 0
 #endif
 
-/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
+/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN
+// compiler  solaris   __SUNPRO_C
+// version   studio
+// 5.7       10        0x570
+// 5.8       11        0x580
+// 5.9       12        0x590
+// 5.10	     12.1      0x5100
+// 5.11	     12.2      0x5110
+// 5.12	     12.3      0x5120
 #if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
-  #define EIGEN_OS_SUN 1
+  #define EIGEN_OS_SUN __SUNPRO_C
 #else
   #define EIGEN_OS_SUN 0
 #endif
@@ -314,26 +449,137 @@
 #endif
 
 
+//------------------------------------------------------------------------------------------
+// Detect GPU compilers and architectures
+//------------------------------------------------------------------------------------------
 
-#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
-  // see bug 89
-  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
-#else
-  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+  #error "NVCC as the target platform for HIPCC is currently not supported."
 #endif
 
-// This macro can be used to prevent from macro expansion, e.g.:
-//   std::max EIGEN_NOT_A_MACRO(a,b)
-#define EIGEN_NOT_A_MACRO
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  // Means the compiler is either nvcc or clang with CUDA enabled
+  #define EIGEN_CUDACC __CUDACC__
+#endif
 
-#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+  // Means we are generating code for the device
+  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
 #else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+  #define EIGEN_CUDA_SDK_VER 0
 #endif
 
-#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+  // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
+  #define EIGEN_HIPCC __HIPCC__
+
+  // We need to include hip_runtime.h here because it pulls in
+  // ++ hip_common.h which contains the define for  __HIP_DEVICE_COMPILE__
+  // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+  #include <hip/hip_runtime.h>
+
+  #if defined(__HIP_DEVICE_COMPILE__)
+    // analogous to EIGEN_CUDA_ARCH, but for HIP
+    #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+  #endif
+
+  // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
+  // value to 1024. The compiler assigns a default value of 256 when the attribute is not
+  // specified. This results in failures on the HIP platform, for cases when a GPU kernel
+  // without an explicit launch_bounds attribute is called with a threads_per_block value
+  // greater than 256.
+  //
+  // This is a regression in functioanlity and is expected to be fixed within the next
+  // couple of ROCm releases (compiler will go back to using 1024 value as the default)
+  //
+  // In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
+  // attribute.
+
+  #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+
+#endif
+
+#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+
+// Unify CUDA/HIPCC
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
+//
+#define EIGEN_GPUCC
+//
+// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
+// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
+//
+// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
+// For those cases, the corresponding code should be guarded with
+//      #if defined(EIGEN_GPUCC)
+// instead of
+//      #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDACC)
+//
+#endif
+
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
+//
+#define EIGEN_GPU_COMPILE_PHASE
+//
+// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
+//   + one to compile the source for the "host" (ie CPU)
+//   + another to compile the source for the "device" (ie. GPU)
+//
+// Code that needs to enabled only during the either the "host" or "device" compilation phase
+// needs to be guarded with a macro that indicates the current compilation phase
+//
+// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
+// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
+//
+// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
+// For those cases, the code should be guarded with
+//       #if defined(EIGEN_GPU_COMPILE_PHASE)
+// instead of
+//       #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDA_ARCH)
+//
+#endif
+
+#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro.
+// In most cases we want to check if both macros are defined which can be done using the define below.
+#define SYCL_DEVICE_ONLY
+#endif
+
+//------------------------------------------------------------------------------------------
+// Detect Compiler/Architecture/OS specific features
+//------------------------------------------------------------------------------------------
+
+#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
+  // see bug 89
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+#else
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
 #endif
 
 // Cross compiler wrapper around LLVM's __has_builtin
@@ -349,26 +595,79 @@
 # define __has_feature(x) 0
 #endif
 
-// Upperbound on the C++ version to use.
-// Expected values are 03, 11, 14, 17, etc.
-// By default, let's use an arbitrarily large C++ version.
-#ifndef EIGEN_MAX_CPP_VER
-#define EIGEN_MAX_CPP_VER 99
+// Some old compilers do not support template specializations like:
+// template<typename T,int N> void foo(const T x[N]);
+#if !(   EIGEN_COMP_CLANG && (   (EIGEN_COMP_CLANG<309)                                                       \
+                              || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000)))  \
+      || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49)
+#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1
+#else
+#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
+#endif
+
+// The macro EIGEN_CPLUSPLUS is a replacement for __cplusplus/_MSVC_LANG that
+// works for both platforms, indicating the C++ standard version number.
+//
+// With MSVC, without defining /Zc:__cplusplus, the __cplusplus macro will
+// report 199711L regardless of the language standard specified via /std.
+// We need to rely on _MSVC_LANG instead, which is only available after
+// VS2015.3.
+#if EIGEN_COMP_MSVC_LANG > 0
+#define EIGEN_CPLUSPLUS EIGEN_COMP_MSVC_LANG
+#elif EIGEN_COMP_MSVC >= 1900
+#define EIGEN_CPLUSPLUS 201103L
+#elif defined(__cplusplus)
+#define EIGEN_CPLUSPLUS __cplusplus
+#else
+#define EIGEN_CPLUSPLUS 0
+#endif
+
+// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler.
+// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER
+// is defined to 17.
+#if EIGEN_CPLUSPLUS > 201703L
+  #define EIGEN_COMP_CXXVER 20
+#elif EIGEN_CPLUSPLUS > 201402L
+  #define EIGEN_COMP_CXXVER 17
+#elif EIGEN_CPLUSPLUS > 201103L
+  #define EIGEN_COMP_CXXVER 14
+#elif EIGEN_CPLUSPLUS >= 201103L
+  #define EIGEN_COMP_CXXVER 11
+#else
+  #define EIGEN_COMP_CXXVER 03
+#endif
+
+#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
+  #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14
+    #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1
+  #else
+    #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0
+  #endif
 #endif
 
-#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
+
+// The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features
+// but in practice we should not rely on them but rather on the availabilty of
+// individual features as defined later.
+// This is why there is no EIGEN_HAS_CXX17.
+// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11.
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11
 #define EIGEN_HAS_CXX11 1
 #else
 #define EIGEN_HAS_CXX11 0
 #endif
 
+#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14
+#define EIGEN_HAS_CXX14 1
+#else
+#define EIGEN_HAS_CXX14 0
+#endif
 
 // Do we support r-value references?
 #ifndef EIGEN_HAS_RVALUE_REFERENCES
 #if EIGEN_MAX_CPP_VER>=11 && \
     (__has_feature(cxx_rvalue_references) || \
-    (defined(__cplusplus) && __cplusplus >= 201103L) || \
-    (EIGEN_COMP_MSVC >= 1600))
+     (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600))
   #define EIGEN_HAS_RVALUE_REFERENCES 1
 #else
   #define EIGEN_HAS_RVALUE_REFERENCES 0
@@ -376,12 +675,14 @@
 #endif
 
 // Does the compiler support C99?
+// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
+#include <cmath>
 #ifndef EIGEN_HAS_C99_MATH
 #if EIGEN_MAX_CPP_VER>=11 && \
     ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
   || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
   || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \
-  || (EIGEN_COMP_MSVC >= 1900) )
+  || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY))
   #define EIGEN_HAS_C99_MATH 1
 #else
   #define EIGEN_HAS_C99_MATH 0
@@ -389,14 +690,50 @@
 #endif
 
 // Does the compiler support result_of?
+// result_of was deprecated in c++17 and removed in c++ 20
 #ifndef EIGEN_HAS_STD_RESULT_OF
-#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)))
+#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17
 #define EIGEN_HAS_STD_RESULT_OF 1
 #else
 #define EIGEN_HAS_STD_RESULT_OF 0
 #endif
 #endif
 
+// Does the compiler support std::hash?
+#ifndef EIGEN_HAS_STD_HASH
+// The std::hash struct is defined in C++11 but is not labelled as a __device__
+// function and is not constexpr, so cannot be used on device.
+#if EIGEN_HAS_CXX11 && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_HAS_STD_HASH 1
+#else
+#define EIGEN_HAS_STD_HASH 0
+#endif
+#endif  // EIGEN_HAS_STD_HASH
+
+#ifndef EIGEN_HAS_STD_INVOKE_RESULT
+#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17
+#define EIGEN_HAS_STD_INVOKE_RESULT 1
+#else
+#define EIGEN_HAS_STD_INVOKE_RESULT 0
+#endif
+#endif
+
+#ifndef EIGEN_HAS_ALIGNAS
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 &&   \
+      (     __has_feature(cxx_alignas)            \
+        ||  EIGEN_HAS_CXX14                       \
+        || (EIGEN_COMP_MSVC >= 1800)              \
+        || (EIGEN_GNUC_AT_LEAST(4,8))             \
+        || (EIGEN_COMP_CLANG>=305)                \
+        || (EIGEN_COMP_ICC>=1500)                 \
+        || (EIGEN_COMP_PGI>=1500)                 \
+        || (EIGEN_COMP_SUNCC>=0x5130))
+#define EIGEN_HAS_ALIGNAS 1
+#else
+#define EIGEN_HAS_ALIGNAS 0
+#endif
+#endif
+
 // Does the compiler support type_traits?
 // - full support of type traits was added only to GCC 5.1.0.
 // - 20150626 corresponds to the last release of 4.x libstdc++
@@ -413,11 +750,13 @@
 
 // Does the compiler support variadic templates?
 #ifndef EIGEN_HAS_VARIADIC_TEMPLATES
-#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
-  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) \
+  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) )
     // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
     //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#elif  EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) && defined(SYCL_DEVICE_ONLY)
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
 #else
 #define EIGEN_HAS_VARIADIC_TEMPLATES 0
 #endif
@@ -425,27 +764,33 @@
 
 // Does the compiler fully support const expressions? (as in c++14)
 #ifndef EIGEN_HAS_CONSTEXPR
+  #if defined(EIGEN_CUDACC)
+  // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+    #if EIGEN_MAX_CPP_VER>=14 && (EIGEN_COMP_CXXVER >= 11 && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500))
+      #define EIGEN_HAS_CONSTEXPR 1
+    #endif
+  #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (EIGEN_COMP_CXXVER >= 14) || \
+    (EIGEN_GNUC_AT_LEAST(4,8) && (EIGEN_COMP_CXXVER >= 11)) || \
+    (EIGEN_COMP_CLANG >= 306 && (EIGEN_COMP_CXXVER >= 11)))
+    #define EIGEN_HAS_CONSTEXPR 1
+  #endif
 
-#ifdef __CUDACC__
-// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
-  #define EIGEN_HAS_CONSTEXPR 1
-#endif
-#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
-  (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)))
-#define EIGEN_HAS_CONSTEXPR 1
-#endif
+  #ifndef EIGEN_HAS_CONSTEXPR
+    #define EIGEN_HAS_CONSTEXPR 0
+  #endif
 
-#ifndef EIGEN_HAS_CONSTEXPR
-#define EIGEN_HAS_CONSTEXPR 0
-#endif
+#endif // EIGEN_HAS_CONSTEXPR
 
+#if EIGEN_HAS_CONSTEXPR
+#define EIGEN_CONSTEXPR constexpr
+#else
+#define EIGEN_CONSTEXPR
 #endif
 
 // Does the compiler support C++11 math?
 // Let's be conservative and enable the default C++11 implementation only if we are sure it exists
 #ifndef EIGEN_HAS_CXX11_MATH
-  #if EIGEN_MAX_CPP_VER>=11 && ((__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
+  #if EIGEN_MAX_CPP_VER>=11 && ((EIGEN_COMP_CXXVER > 11) || (EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
       && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC))
     #define EIGEN_HAS_CXX11_MATH 1
   #else
@@ -456,9 +801,8 @@
 // Does the compiler support proper C++11 containers?
 #ifndef EIGEN_HAS_CXX11_CONTAINERS
   #if    EIGEN_MAX_CPP_VER>=11 && \
-         ((__cplusplus > 201103L) \
-      || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
-      || EIGEN_COMP_MSVC >= 1900)
+         ((EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400)))
     #define EIGEN_HAS_CXX11_CONTAINERS 1
   #else
     #define EIGEN_HAS_CXX11_CONTAINERS 0
@@ -469,24 +813,88 @@
 #ifndef EIGEN_HAS_CXX11_NOEXCEPT
   #if    EIGEN_MAX_CPP_VER>=11 && \
          (__has_feature(cxx_noexcept) \
-      || (__cplusplus > 201103L) \
-      || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
-      || EIGEN_COMP_MSVC >= 1900)
+      || (EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400)))
     #define EIGEN_HAS_CXX11_NOEXCEPT 1
   #else
     #define EIGEN_HAS_CXX11_NOEXCEPT 0
   #endif
 #endif
 
-/** Allows to disable some optimizations which might affect the accuracy of the result.
-  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
-  * They currently include:
-  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
-  */
-#ifndef EIGEN_FAST_MATH
-#define EIGEN_FAST_MATH 1
+#ifndef EIGEN_HAS_CXX11_ATOMIC
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+         (__has_feature(cxx_atomic) \
+      || (EIGEN_COMP_CXXVER > 11) \
+      || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700)))
+    #define EIGEN_HAS_CXX11_ATOMIC 1
+  #else
+    #define EIGEN_HAS_CXX11_ATOMIC 0
+  #endif
+#endif
+
+#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL
+  #if    EIGEN_MAX_CPP_VER>=11 && \
+       (EIGEN_COMP_CXXVER >= 11 || EIGEN_COMP_MSVC >= 1700)
+    #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1
+  #else
+    #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0
+  #endif
+#endif
+
+// NOTE: the required Apple's clang version is very conservative
+//       and it could be that XCode 9 works just fine.
+// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support
+//       and not tested.
+#ifndef EIGEN_HAS_CXX17_OVERALIGN
+#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && (                                 \
+           (EIGEN_COMP_MSVC >= 1912)                                                    \
+        || (EIGEN_GNUC_AT_LEAST(7,0))                                                   \
+        || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500))             \
+        || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \
+      )
+#define EIGEN_HAS_CXX17_OVERALIGN 1
+#else
+#define EIGEN_HAS_CXX17_OVERALIGN 0
+#endif
+#endif
+
+#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR
+  // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+  #if defined(__NVCC__)
+    // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+    #ifdef __CUDACC_RELAXED_CONSTEXPR__
+      #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+    #endif
+  #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr)
+    // clang++ always considers constexpr functions as implicitly __host__ __device__
+    #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+  #endif
+#endif
+
+// Does the compiler support the __int128 and __uint128_t extensions for 128-bit
+// integer arithmetic?
+//
+// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported,
+// but we avoid using them in certain cases:
+//
+// * Building using Clang for Windows, where the Clang runtime library has
+//   128-bit support only on LP64 architectures, but Windows is LLP64.
+#ifndef EIGEN_HAS_BUILTIN_INT128
+#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG)
+#define EIGEN_HAS_BUILTIN_INT128 1
+#else
+#define EIGEN_HAS_BUILTIN_INT128 0
+#endif
 #endif
 
+//------------------------------------------------------------------------------------------
+// Preprocessor programming helpers
+//------------------------------------------------------------------------------------------
+
+// This macro can be used to prevent from macro expansion, e.g.:
+//   std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
 #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
 
 // concatenate two tokens
@@ -503,7 +911,7 @@
 // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
 // but GCC is still doing fine with just inline.
 #ifndef EIGEN_STRONG_INLINE
-#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
+#if (EIGEN_COMP_MSVC || EIGEN_COMP_ICC) && !defined(EIGEN_GPUCC)
 #define EIGEN_STRONG_INLINE __forceinline
 #else
 #define EIGEN_STRONG_INLINE inline
@@ -518,7 +926,7 @@
 //   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
 //    : function body not available
 //   See also bug 1367
-#if EIGEN_GNUC_AT_LEAST(4,2)
+#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY)
 #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
 #else
 #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
@@ -538,12 +946,43 @@
 #define EIGEN_PERMISSIVE_EXPR
 #endif
 
+// GPU stuff
+
+// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
+#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC)
+  // Do not try asserts on device code
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
+  #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline))
+// All functions callable from CUDA/HIP code must be qualified with __device__
+#elif defined(EIGEN_GPUCC)
+    #define EIGEN_DEVICE_FUNC __host__ __device__
+#else
+  #define EIGEN_DEVICE_FUNC
+#endif
+
+
 // this macro allows to get rid of linking errors about multiply defined functions.
 //  - static is not very good because it prevents definitions from different object files to be merged.
 //           So static causes the resulting linked executable to be bloated with multiple copies of the same function.
 //  - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline
+#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline
 
 #ifdef NDEBUG
 # ifndef EIGEN_NO_DEBUG
@@ -553,7 +992,11 @@
 
 // eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89
 #ifdef EIGEN_NO_DEBUG
-  #define eigen_plain_assert(x)
+  #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device
+    #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x)
+  #else
+    #define eigen_plain_assert(x)
+  #endif
 #else
   #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO
     namespace Eigen {
@@ -627,7 +1070,7 @@
 // Suppresses 'unused variable' warnings.
 namespace Eigen {
   namespace internal {
-    template<typename T> EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {}
+    template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {}
   }
 }
 #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
@@ -641,169 +1084,75 @@ namespace Eigen {
 #endif
 
 
-//------------------------------------------------------------------------------------------
-// Static and dynamic alignment control
-//
-// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
-// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
-// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
-// a default value is automatically computed based on architecture, compiler, and OS.
+// Acts as a barrier preventing operations involving `X` from crossing. This
+// occurs, for example, in the fast rounding trick where a magic constant is
+// added then subtracted, which is otherwise compiled away with -ffast-math.
 //
-// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
-// to be used to declare statically aligned buffers.
-//------------------------------------------------------------------------------------------
-
-
-/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
- * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
- * so that vectorization doesn't affect binary compatibility.
- *
- * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
- * vectorized and non-vectorized code.
- */
-#if (defined __CUDACC__)
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
-#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#elif EIGEN_COMP_MSVC
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
-#elif EIGEN_COMP_SUNCC
-  // FIXME not sure about this one:
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#else
-  #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
-#endif
-
-// If the user explicitly disable vectorization, then we also disable alignment
-#if defined(EIGEN_DONT_VECTORIZE)
-  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
-#elif defined(EIGEN_VECTORIZE_AVX512)
-  // 64 bytes static alignmeent is preferred only if really required
-  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
-#elif defined(__AVX__)
-  // 32 bytes static alignmeent is preferred only if really required
-  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
-#else
-  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
-#endif
-
-
-// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
-#define EIGEN_MIN_ALIGN_BYTES 16
-
-// Defined the boundary (in bytes) on which the data needs to be aligned. Note
-// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
-// aligned at all regardless of the value of this #define.
-
-#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
-#endif
-
-// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated
-// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
-#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
-  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
-    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
-  #endif
-  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
-#endif
-
-#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
-
-  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
-
-  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
-  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
-  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
-  // certain common platform (compiler+architecture combinations) to avoid these problems.
-  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
-  // try to keep heap alignment even when we have to disable static alignment.
-  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
-  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
-  // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
-  // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
-  // 4.8 and newer seem definitely unaffected.
-  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-  #else
-  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
-  #endif
-
-  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
-  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
-  && !EIGEN_GCC3_OR_OLDER \
-  && !EIGEN_COMP_SUNCC \
-  && !EIGEN_OS_QNX
-    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
-  #else
-    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
-  #endif
-
-  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
-    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+// See bug 1674
+#if !defined(EIGEN_OPTIMIZATION_BARRIER)
+  #if EIGEN_COMP_GNUC
+    // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
+    //   X: Any operand whatsoever.
+    //   r: A register operand is allowed provided that it is in a general
+    //      register.
+    //   g: Any register, memory or immediate integer operand is allowed, except
+    //      for registers that are not general registers.
+    //   w: (AArch32/AArch64) Floating point register, Advanced SIMD vector
+    //      register or SVE vector register.
+    //   x: (SSE) Any SSE register.
+    //      (AArch64) Like w, but restricted to registers 0 to 15 inclusive.
+    //   v: (PowerPC) An Altivec vector register.
+    //   wa:(PowerPC) A VSX register.
+    //
+    // "X" (uppercase) should work for all cases, though this seems to fail for
+    // some versions of GCC for arm/aarch64 with
+    //   "error: inconsistent operand constraints in an 'asm'"
+    // Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and
+    // vectors, otherwise
+    //   "error: non-trivial scalar-to-vector conversion, possible invalid
+    //    constraint for vector type"
+    //
+    // GCC for ppc64le generates an internal compiler error with x/X/g.
+    // GCC for AVX generates an internal compiler error with X.
+    //
+    // Tested on icc/gcc/clang for sse, avx, avx2, avx512dq
+    //           gcc for arm, aarch64,
+    //           gcc for ppc64le,
+    // both vectors and scalars.
+    //
+    // Note that this is restricted to plain types - this will not work
+    // directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
+    // you will need to apply to the underlying POD type.
+    #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
+      // This seems to be broken on clang.  Packet4f is loaded into a single
+      //   register rather than a vector, zeroing out some entries.  Integer
+      //   types also generate a compile error.
+      // General, Altivec, VSX.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v,wa" (X));
+    #elif EIGEN_ARCH_ARM_OR_ARM64
+      // General, NEON.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,w" (X));
+    #elif EIGEN_ARCH_i386_OR_x86_64
+      // General, SSE.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,x" (X));
+    #else
+      // Not implemented for other architectures.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)
+    #endif
   #else
-    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+    // Not implemented for other compilers.
+    #define EIGEN_OPTIMIZATION_BARRIER(X)
   #endif
-
 #endif
 
-// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES
-#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
-#undef EIGEN_MAX_STATIC_ALIGN_BYTES
-#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
-  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-#endif
-
-// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
-// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES)
-// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
-// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
-
-
-// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
-#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
-#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
-#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
-#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#if EIGEN_COMP_MSVC
+  // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362.
+  // This workaround is ugly, but it does the job.
+#  define EIGEN_CONST_CONDITIONAL(cond)  (void)0, cond
 #else
-#define EIGEN_ALIGN_MAX
+#  define EIGEN_CONST_CONDITIONAL(cond)  cond
 #endif
 
-
-// Dynamic alignment control
-
-#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
-#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
-#endif
-
-#ifdef EIGEN_DONT_ALIGN
-  #ifdef EIGEN_MAX_ALIGN_BYTES
-    #undef EIGEN_MAX_ALIGN_BYTES
-  #endif
-  #define EIGEN_MAX_ALIGN_BYTES 0
-#elif !defined(EIGEN_MAX_ALIGN_BYTES)
-  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#else
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-
-#ifndef EIGEN_UNALIGNED_VECTORIZE
-#define EIGEN_UNALIGNED_VECTORIZE 1
-#endif
-
-//----------------------------------------------------------------------
-
-
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
   #define EIGEN_RESTRICT
 #endif
@@ -811,10 +1160,6 @@ namespace Eigen {
   #define EIGEN_RESTRICT __restrict
 #endif
 
-#ifndef EIGEN_STACK_ALLOCATION_LIMIT
-// 131072 == 128 KB
-#define EIGEN_STACK_ALLOCATION_LIMIT 131072
-#endif
 
 #ifndef EIGEN_DEFAULT_IO_FORMAT
 #ifdef EIGEN_MAKING_DOCS
@@ -829,8 +1174,23 @@ namespace Eigen {
 // just an empty macro !
 #define EIGEN_EMPTY
 
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
-  // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
+
+// When compiling CUDA/HIP device code with NVCC or HIPCC
+// pull in math functions from the global namespace.
+// In host mode, and when device code is compiled with clang,
+// use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_USING_STD(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD(FUNC) using std::FUNC;
+#endif
+
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC))
+  // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary,
+  //   otherwise we get duplicate definition errors
+  // For later MSVC versions, we require explicit operator= definition, otherwise we get
+  //   use of implicitly deleted operator errors.
+  // (cf Bugs 920, 1000, 1324, 2291)
   #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =;
 #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -856,7 +1216,7 @@ namespace Eigen {
  * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
  */
 #if EIGEN_HAS_CXX11
-#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default;
 #else
 #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
 #endif
@@ -881,12 +1241,12 @@ namespace Eigen {
  */
 #if EIGEN_HAS_CXX11
 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived)  \
-    EIGEN_DEVICE_FUNC Derived() = default; \
-    EIGEN_DEVICE_FUNC ~Derived() = default;
+    Derived() = default; \
+    ~Derived() = default;
 #else
 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived)  \
-    EIGEN_DEVICE_FUNC Derived() {}; \
-    /* EIGEN_DEVICE_FUNC ~Derived() {}; */
+    Derived() {}; \
+    /* ~Derived() {}; */
 #endif
 
 
@@ -908,7 +1268,8 @@ namespace Eigen {
   typedef typename Eigen::internal::ref_selector<Derived>::type Nested; \
   typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
   typedef typename Eigen::internal::traits<Derived>::StorageIndex StorageIndex; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
+  enum CompileTimeTraits \
+      { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
         ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
         Flags = Eigen::internal::traits<Derived>::Flags, \
         SizeAtCompileTime = Base::SizeAtCompileTime, \
@@ -953,6 +1314,14 @@ namespace Eigen {
 
 #define EIGEN_IMPLIES(a,b) (!(a) || (b))
 
+#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC
+#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false))
+#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
+#else
+#define EIGEN_PREDICT_FALSE(x) (x)
+#define EIGEN_PREDICT_TRUE(x) (x)
+#endif
+
 // the expression type of a standard coefficient wise binary operation
 #define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \
     CwiseBinaryOp< \
@@ -984,14 +1353,14 @@ namespace Eigen {
                 const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
 
 // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
-#if EIGEN_COMP_MSVC_STRICT<=1600
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
 #else
 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
 #endif
 
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
-  template <typename T> EIGEN_DEVICE_FUNC inline \
+  template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
   EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
   (METHOD)(const T& scalar) const { \
     typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
@@ -1000,7 +1369,7 @@ namespace Eigen {
   }
 
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
-  template <typename T> EIGEN_DEVICE_FUNC inline friend \
+  template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \
   EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
   (METHOD)(const T& scalar, const StorageBaseType& matrix) { \
     typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
@@ -1013,15 +1382,23 @@ namespace Eigen {
   EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
 
 
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+
 #ifdef EIGEN_EXCEPTIONS
 #  define EIGEN_THROW_X(X) throw X
 #  define EIGEN_THROW throw
 #  define EIGEN_TRY try
 #  define EIGEN_CATCH(X) catch (X)
 #else
-#  ifdef __CUDA_ARCH__
+#  if defined(EIGEN_CUDA_ARCH)
 #    define EIGEN_THROW_X(X) asm("trap;")
 #    define EIGEN_THROW asm("trap;")
+#  elif defined(EIGEN_HIP_DEVICE_COMPILE)
+#    define EIGEN_THROW_X(X) asm("s_trap 0")
+#    define EIGEN_THROW asm("s_trap 0")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
@@ -1041,13 +1418,47 @@ namespace Eigen {
 #   define EIGEN_NOEXCEPT
 #   define EIGEN_NOEXCEPT_IF(x)
 #   define EIGEN_NO_THROW throw()
-#   if EIGEN_COMP_MSVC
+#   if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17
       // MSVC does not support exception specifications (warning C4290),
-      // and they are deprecated in c++11 anyway.
+      // and they are deprecated in c++11 anyway. This is even an error in c++17.
 #     define EIGEN_EXCEPTION_SPEC(X) throw()
 #   else
 #     define EIGEN_EXCEPTION_SPEC(X) throw(X)
 #   endif
 #endif
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
+namespace Eigen {
+namespace internal {
+
+inline bool all(){ return true; }
+
+template<typename T, typename ...Ts>
+bool all(T t, Ts ... ts){ return t && all(ts...); }
+
+}
+}
+#endif
+
+#if EIGEN_HAS_CXX11_OVERRIDE_FINAL
+// provide override and final specifiers if they are available:
+#   define EIGEN_OVERRIDE override
+#   define EIGEN_FINAL final
+#else
+#   define EIGEN_OVERRIDE
+#   define EIGEN_FINAL
+#endif
+
+// Wrapping #pragma unroll in a macro since it is required for SYCL
+#if defined(SYCL_DEVICE_ONLY)
+  #if defined(_MSC_VER)
+    #define EIGEN_UNROLL_LOOP __pragma(unroll)
+  #else
+    #define EIGEN_UNROLL_LOOP _Pragma("unroll")
+  #endif
+#else
+  #define EIGEN_UNROLL_LOOP
+#endif
+
 #endif // EIGEN_MACROS_H
diff --git a/contrib/eigen/Eigen/src/Core/util/Memory.h b/contrib/eigen/Eigen/src/Core/util/Memory.h
index 291383c581a7fcc4abe12a67c4eb1ed33c5333f5..875318cdb1ce2e9716a5a744ecd2485848ce5065 100644
--- a/contrib/eigen/Eigen/src/Core/util/Memory.h
+++ b/contrib/eigen/Eigen/src/Core/util/Memory.h
@@ -63,14 +63,28 @@ namespace Eigen {
 
 namespace internal {
 
-EIGEN_DEVICE_FUNC 
+EIGEN_DEVICE_FUNC
 inline void throw_std_bad_alloc()
 {
   #ifdef EIGEN_EXCEPTIONS
     throw std::bad_alloc();
   #else
     std::size_t huge = static_cast<std::size_t>(-1);
-    ::operator new(huge);
+    #if defined(EIGEN_HIPCC)
+    //
+    // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
+    // and as a consequence the code in the #else block triggers the hipcc warning :
+    // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+    //
+    // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+    // the same on "operator new"
+    // Reverting code back to the old version in this #if block for the hipcc compiler
+    //
+    new int[huge];
+    #else
+    void* unused = ::operator new(huge);
+    EIGEN_UNUSED_VARIABLE(unused);
+    #endif
   #endif
 }
 
@@ -83,19 +97,26 @@ inline void throw_std_bad_alloc()
 /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
   * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
   */
-inline void* handmade_aligned_malloc(std::size_t size)
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
-  void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES);
+  eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2");
+
+  EIGEN_USING_STD(malloc)
+  void *original = malloc(size+alignment);
+  
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
 
 /** \internal Frees memory allocated with handmade_aligned_malloc */
-inline void handmade_aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
 {
-  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
+  if (ptr) {
+    EIGEN_USING_STD(free)
+    free(*(reinterpret_cast<void**>(ptr) - 1));
+  }
 }
 
 /** \internal
@@ -114,7 +135,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
   void *previous_aligned = static_cast<char *>(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
-  
+
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
@@ -142,7 +163,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
 }
-#else 
+#else
 EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {}
 #endif
@@ -156,9 +177,12 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 
   void *result;
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
-    result = std::malloc(size);
+
+    EIGEN_USING_STD(malloc)
+    result = malloc(size);
+
     #if EIGEN_DEFAULT_ALIGN_BYTES==16
-    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator.");
+    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator.");
     #endif
   #else
     result = handmade_aligned_malloc(size);
@@ -174,7 +198,10 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
-    std::free(ptr);
+
+    EIGEN_USING_STD(free)
+    free(ptr);
+
   #else
     handmade_aligned_free(ptr);
   #endif
@@ -187,7 +214,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
   */
 inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size)
 {
-  EIGEN_UNUSED_VARIABLE(old_size);
+  EIGEN_UNUSED_VARIABLE(old_size)
 
   void *result;
 #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
@@ -218,7 +245,9 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
 {
   check_that_malloc_is_allowed();
 
-  void *result = std::malloc(size);
+  EIGEN_USING_STD(malloc)
+  void *result = malloc(size);
+
   if(!result && size)
     throw_std_bad_alloc();
   return result;
@@ -232,7 +261,8 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
 
 template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
-  std::free(ptr);
+  EIGEN_USING_STD(free)
+  free(ptr);
 }
 
 template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@@ -331,7 +361,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
 template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
-  aligned_free(ptr);
+  Eigen::internal::aligned_free(ptr);
 }
 
 /** \internal Deletes objects constructed with conditional_aligned_new
@@ -471,8 +501,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index
 }
 
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
-  */ 
-template<typename Index> 
+  */
+template<typename Index>
 inline Index first_multiple(Index size, Index base)
 {
   return ((size+base-1)/base)*base;
@@ -493,7 +523,8 @@ template<typename T> struct smart_copy_helper<T,true> {
     IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
-    std::memcpy(target, start, size);
+    EIGEN_USING_STD(memcpy)
+    memcpy(target, start, size);
   }
 };
 
@@ -502,7 +533,7 @@ template<typename T> struct smart_copy_helper<T,false> {
   { std::copy(start, end, target); }
 };
 
-// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. 
+// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise.
 template<typename T, bool UseMemmove> struct smart_memmove_helper;
 
 template<typename T> void smart_memmove(const T* start, const T* end, T* target)
@@ -522,19 +553,30 @@ template<typename T> struct smart_memmove_helper<T,true> {
 
 template<typename T> struct smart_memmove_helper<T,false> {
   static inline void run(const T* start, const T* end, T* target)
-  { 
+  {
     if (UIntPtr(target) < UIntPtr(start))
     {
       std::copy(start, end, target);
     }
-    else                                 
+    else
     {
       std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T);
-      std::copy_backward(start, end, target + count); 
+      std::copy_backward(start, end, target + count);
     }
   }
 };
 
+#if EIGEN_HAS_RVALUE_REFERENCES
+template<typename T> EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target)
+{
+  return std::move(start, end, target);
+}
+#else
+template<typename T> EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target)
+{
+  return std::copy(start, end, target);
+}
+#endif
 
 /*****************************************************************************
 *** Implementation of runtime stack allocation (falling back to malloc)    ***
@@ -542,7 +584,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
 
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
-#ifndef EIGEN_ALLOCA
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
   #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
     #define EIGEN_ALLOCA alloca
   #elif EIGEN_COMP_MSVC
@@ -550,6 +592,15 @@ template<typename T> struct smart_memmove_helper<T,false> {
   #endif
 #endif
 
+// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
+// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because
+// the compiler still emits bad code because stack allocation checks use "<=".
+// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
+// is fixed.
+#if defined(__clang__) && defined(__thumb__)
+  #undef EIGEN_ALLOCA
+#endif
+
 // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
 // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
 template<typename T> class aligned_stack_memory_handler : noncopyable
@@ -561,12 +612,14 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
      * In this case, the buffer elements will also be destructed when this handler will be destructed.
      * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
      **/
+    EIGEN_DEVICE_FUNC
     aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
       : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
         Eigen::internal::construct_elements_of_array(m_ptr, size);
     }
+    EIGEN_DEVICE_FUNC
     ~aligned_stack_memory_handler()
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -580,6 +633,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
     bool m_deallocate;
 };
 
+#ifdef EIGEN_ALLOCA
+
+template<typename Xpr, int NbEvaluations,
+         bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+         >
+struct local_nested_eval_wrapper
+{
+  static const bool NeedExternalBuffer = false;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+  {
+    EIGEN_UNUSED_VARIABLE(ptr);
+    eigen_internal_assert(ptr==0);
+  }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+  static const bool NeedExternalBuffer = true;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename plain_object_eval<Xpr>::type PlainObject;
+  typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+    : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+      m_deallocate(ptr==0)
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::construct_elements_of_array(object.data(), object.size());
+    object = xpr;
+  }
+
+  EIGEN_DEVICE_FUNC
+  ~local_nested_eval_wrapper()
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+    if(m_deallocate)
+      Eigen::internal::aligned_free(object.data());
+  }
+
+private:
+  bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
 template<typename T> class scoped_array : noncopyable
 {
   T* m_ptr;
@@ -603,13 +710,15 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 {
   std::swap(a.ptr(),b.ptr());
 }
-    
+
 } // end namespace internal
 
 /** \internal
-  * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
-  * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
-  * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+  *
+  * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+  * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+  * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+  * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
   * The allocated buffer is automatically deleted when exiting the scope of this declaration.
   * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
   * Here is an example:
@@ -620,9 +729,17 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
   * }
   * \endcode
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
+  *
+  * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to
+  * \code
+  *   typename internal::nested_eval<XPRT_T,N>::type NAME(XPR);
+  * \endcode
+  * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+  * This is accomplished through alloca if this later is supported and if the required number of bytes
+  * is below EIGEN_STACK_ALLOCATION_LIMIT.
   */
 #ifdef EIGEN_ALLOCA
-  
+
   #if EIGEN_DEFAULT_ALIGN_BYTES>0
     // We always manually re-align the result of EIGEN_ALLOCA.
     // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
@@ -639,13 +756,23 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
                     : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
 
+
+  #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+    Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+      ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+        ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+    typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
 #else
 
   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
     Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
     TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE));    \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
-    
+
+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR)
+
 #endif
 
 
@@ -653,32 +780,56 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
 *****************************************************************************/
 
-#if EIGEN_MAX_ALIGN_BYTES!=0
+#if EIGEN_HAS_CXX17_OVERALIGN
+
+// C++17 -> no need to bother about alignment anymore :)
+
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)
+
+#else
+
+// HIP does not support new/delete on device.
+#if EIGEN_MAX_ALIGN_BYTES!=0 && !defined(EIGEN_HIP_DEVICE_COMPILE)
   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC \
       void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
         EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
         EIGEN_CATCH (...) { return 0; } \
       }
   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC \
       void *operator new(std::size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
+      EIGEN_DEVICE_FUNC \
       void *operator new[](std::size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
+      EIGEN_DEVICE_FUNC \
       void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
       void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
       void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      EIGEN_DEVICE_FUNC \
       void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
       /* in-place new and delete. since (at least afaik) there is no actual   */ \
       /* memory allocated we can safely let the default implementation handle */ \
       /* this particular case. */ \
+      EIGEN_DEVICE_FUNC \
       static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \
+      EIGEN_DEVICE_FUNC \
       static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \
+      EIGEN_DEVICE_FUNC \
       void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \
+      EIGEN_DEVICE_FUNC \
       void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \
       /* nothrow-new (returns zero instead of std::bad_alloc) */ \
       EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      EIGEN_DEVICE_FUNC \
       void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \
         Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
       } \
@@ -688,8 +839,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 #endif
 
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0)))
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)                        \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(                                                             \
+        ((Size)!=Eigen::Dynamic) &&                                                                    \
+        (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES  )==0)) ||    \
+         ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) ||    \
+         ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0))   )))
+
+#endif
 
 /****************************************************************************/
 
@@ -703,13 +860,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 *  - 32 bytes alignment if AVX is enabled.
 *  - 64 bytes alignment if AVX512 is enabled.
 *
-* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
 * \link TopicPreprocessorDirectivesPerformance there \endlink.
 *
 * Example:
 * \code
 * // Matrix4f requires 16 bytes alignment:
-* std::map< int, Matrix4f, std::less<int>, 
+* std::map< int, Matrix4f, std::less<int>,
 *           aligned_allocator<std::pair<const int, Matrix4f> > > my_map_mat4;
 * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator:
 * std::map< int, Vector3f > my_map_vec3;
@@ -744,18 +901,19 @@ public:
 
   ~aligned_allocator() {}
 
+  #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
+  // In gcc std::allocator::max_size() is bugged making gcc triggers a warning:
+  // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
+  // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
+  size_type max_size() const {
+    return (std::numeric_limits<std::ptrdiff_t>::max)()/sizeof(T);
+  }
+  #endif
+
   pointer allocate(size_type num, const void* /*hint*/ = 0)
   {
     internal::check_size_for_overflow<T>(num);
-    size_type size = num * sizeof(T);
-#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
-    // workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
-    // It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
-    if(size>=std::size_t((std::numeric_limits<std::ptrdiff_t>::max)()))
-      return 0;
-    else
-#endif
-      return static_cast<pointer>( internal::aligned_malloc(size) );
+    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
   }
 
   void deallocate(pointer p, size_type /*num*/)
@@ -914,20 +1072,32 @@ inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs)
 {
   if(max_std_funcs>=4)
     queryCacheSizes_intel_direct(l1,l2,l3);
-  else
+  else if(max_std_funcs>=2)
     queryCacheSizes_intel_codes(l1,l2,l3);
+  else
+    l1 = l2 = l3 = 0;
 }
 
 inline void queryCacheSizes_amd(int& l1, int& l2, int& l3)
 {
   int abcd[4];
   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-  EIGEN_CPUID(abcd,0x80000005,0);
-  l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
-  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-  EIGEN_CPUID(abcd,0x80000006,0);
-  l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
-  l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+  
+  // First query the max supported function.
+  EIGEN_CPUID(abcd,0x80000000,0);
+  if(static_cast<numext::uint32_t>(abcd[0]) >= static_cast<numext::uint32_t>(0x80000006))
+  {
+    EIGEN_CPUID(abcd,0x80000005,0);
+    l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
+    abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+    EIGEN_CPUID(abcd,0x80000006,0);
+    l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
+    l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+  }
+  else
+  {
+    l1 = l2 = l3 = 0;
+  }
 }
 #endif
 
@@ -943,7 +1113,7 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3)
 
   // identify the CPU vendor
   EIGEN_CPUID(abcd,0x0,0);
-  int max_std_funcs = abcd[1];
+  int max_std_funcs = abcd[0];
   if(cpuid_is_vendor(abcd,GenuineIntel))
     queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
   else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
diff --git a/contrib/eigen/Eigen/src/Core/util/Meta.h b/contrib/eigen/Eigen/src/Core/util/Meta.h
index 9b61ff037aa50e58554cefbc91b75b86622fd3c7..81ae2a32d1d290e3173fb27ed9088a41fcf685de 100755
--- a/contrib/eigen/Eigen/src/Core/util/Meta.h
+++ b/contrib/eigen/Eigen/src/Core/util/Meta.h
@@ -11,13 +11,54 @@
 #ifndef EIGEN_META_H
 #define EIGEN_META_H
 
-#if defined(__CUDA_ARCH__)
-#include <cfloat>
-#include <math_constants.h>
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+ #include <cfloat>
+
+ #if defined(EIGEN_CUDA_ARCH)
+  #include <math_constants.h>
+ #endif
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+  #endif
+
 #endif
 
-#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+// Recent versions of ICC require <cstdint> for pointer types below.
+#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11)
+
+// Define portable (u)int{32,64} types
+#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT
 #include <cstdint>
+namespace Eigen {
+namespace numext {
+typedef std::uint8_t  uint8_t;
+typedef std::int8_t   int8_t;
+typedef std::uint16_t uint16_t;
+typedef std::int16_t  int16_t;
+typedef std::uint32_t uint32_t;
+typedef std::int32_t  int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t  int64_t;
+}
+}
+#else
+// Without c++11, all compilers able to compile Eigen also
+// provide the C99 stdint.h header file.
+#include <stdint.h>
+namespace Eigen {
+namespace numext {
+typedef ::uint8_t  uint8_t;
+typedef ::int8_t   int8_t;
+typedef ::uint16_t uint16_t;
+typedef ::int16_t  int16_t;
+typedef ::uint32_t uint32_t;
+typedef ::int32_t  int32_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t  int64_t;
+}
+}
 #endif
 
 namespace Eigen {
@@ -43,26 +84,33 @@ namespace internal {
 
 // Only recent versions of ICC complain about using ptrdiff_t to hold pointers,
 // and older versions do not provide *intptr_t types.
-#if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
+#if EIGEN_ICC_NEEDS_CSTDINT
 typedef std::intptr_t  IntPtr;
 typedef std::uintptr_t UIntPtr;
 #else
 typedef std::ptrdiff_t IntPtr;
 typedef std::size_t UIntPtr;
 #endif
+#undef EIGEN_ICC_NEEDS_CSTDINT
 
 struct true_type {  enum { value = 1 }; };
 struct false_type { enum { value = 0 }; };
 
+template<bool Condition>
+struct bool_constant;
+
+template<>
+struct bool_constant<true> : true_type {};
+
+template<>
+struct bool_constant<false> : false_type {};
+
 template<bool Condition, typename Then, typename Else>
 struct conditional { typedef Then type; };
 
 template<typename Then, typename Else>
 struct conditional <false, Then, Else> { typedef Else type; };
 
-template<typename T, typename U> struct is_same { enum { value = 0 }; };
-template<typename T> struct is_same<T,T> { enum { value = 1 }; };
-
 template<typename T> struct remove_reference { typedef T type; };
 template<typename T> struct remove_reference<T&> { typedef T type; };
 
@@ -97,23 +145,31 @@ template<> struct is_arithmetic<unsigned int>  { enum { value = true }; };
 template<> struct is_arithmetic<signed long>   { enum { value = true }; };
 template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
 
+template<typename T, typename U> struct is_same { enum { value = 0 }; };
+template<typename T> struct is_same<T,T> { enum { value = 1 }; };
+
+template< class T >
+struct is_void : is_same<void, typename remove_const<T>::type> {};
+
 #if EIGEN_HAS_CXX11
+template<> struct is_arithmetic<signed long long>   { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
 using std::is_integral;
 #else
-template<typename T> struct is_integral        { enum { value = false }; };
-template<> struct is_integral<bool>            { enum { value = true }; };
-template<> struct is_integral<char>            { enum { value = true }; };
-template<> struct is_integral<signed char>     { enum { value = true }; };
-template<> struct is_integral<unsigned char>   { enum { value = true }; };
-template<> struct is_integral<signed short>    { enum { value = true }; };
-template<> struct is_integral<unsigned short>  { enum { value = true }; };
-template<> struct is_integral<signed int>      { enum { value = true }; };
-template<> struct is_integral<unsigned int>    { enum { value = true }; };
-template<> struct is_integral<signed long>     { enum { value = true }; };
-template<> struct is_integral<unsigned long>   { enum { value = true }; };
+template<typename T> struct is_integral               { enum { value = false }; };
+template<> struct is_integral<bool>                   { enum { value = true }; };
+template<> struct is_integral<char>                   { enum { value = true }; };
+template<> struct is_integral<signed char>            { enum { value = true }; };
+template<> struct is_integral<unsigned char>          { enum { value = true }; };
+template<> struct is_integral<signed short>           { enum { value = true }; };
+template<> struct is_integral<unsigned short>         { enum { value = true }; };
+template<> struct is_integral<signed int>             { enum { value = true }; };
+template<> struct is_integral<unsigned int>           { enum { value = true }; };
+template<> struct is_integral<signed long>            { enum { value = true }; };
+template<> struct is_integral<unsigned long>          { enum { value = true }; };
 #if EIGEN_COMP_MSVC
-template<> struct is_integral<signed __int64>  { enum { value = true }; };
-template<> struct is_integral<unsigned __int64>{ enum { value = true }; };
+template<> struct is_integral<signed __int64>         { enum { value = true }; };
+template<> struct is_integral<unsigned __int64>       { enum { value = true }; };
 #endif
 #endif
 
@@ -137,6 +193,16 @@ template<> struct make_unsigned<unsigned long>    { typedef unsigned long type;
 template<> struct make_unsigned<signed __int64>   { typedef unsigned __int64 type; };
 template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
 #endif
+
+// Some platforms define int64_t as `long long` even for C++03, where
+// `long long` is not guaranteed by the standard. In this case we are missing
+// the definition for make_unsigned. If we just define it, we run into issues
+// where `long long` doesn't exist in some compilers for C++03. We therefore add
+// the specialization for these platforms only.
+#if EIGEN_OS_MAC || EIGEN_COMP_MINGW
+template<> struct make_unsigned<unsigned long long> { typedef unsigned long long type; };
+template<> struct make_unsigned<long long>          { typedef unsigned long long type; };
+#endif
 #endif
 
 template <typename T> struct add_const { typedef const T type; };
@@ -151,6 +217,11 @@ template<typename T> struct add_const_on_value_type<T*>        { typedef T const
 template<typename T> struct add_const_on_value_type<T* const>  { typedef T const* const type; };
 template<typename T> struct add_const_on_value_type<T const* const>  { typedef T const* const type; };
 
+#if EIGEN_HAS_CXX11
+
+using std::is_convertible;
+
+#else
 
 template<typename From, typename To>
 struct is_convertible_impl
@@ -164,16 +235,19 @@ private:
   struct yes {int a[1];};
   struct no  {int a[2];};
 
-  static yes test(const To&, int);
+  template<typename T>
+  static yes test(T, int);
+
+  template<typename T>
   static no  test(any_conversion, ...);
 
 public:
-  static From ms_from;
+  static typename internal::remove_reference<From>::type* ms_from;
 #ifdef __INTEL_COMPILER
   #pragma warning push
   #pragma warning ( disable : 2259 )
 #endif
-  enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
+  enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
 #ifdef __INTEL_COMPILER
   #pragma warning pop
 #endif
@@ -182,10 +256,17 @@ public:
 template<typename From, typename To>
 struct is_convertible
 {
-  enum { value = is_convertible_impl<typename remove_all<From>::type,
-                                     typename remove_all<To  >::type>::value };
+  enum { value = is_convertible_impl<From,To>::value };
 };
 
+template<typename T>
+struct is_convertible<T,T&> { enum { value = false }; };
+
+template<typename T>
+struct is_convertible<const T,const T&> { enum { value = true }; };
+
+#endif
+
 /** \internal Allows to enable/disable an overload
   * according to a compile time condition.
   */
@@ -194,7 +275,7 @@ template<bool Condition, typename T=void> struct enable_if;
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
 
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
 #if !defined(__FLT_EPSILON__)
 #define __FLT_EPSILON__ FLT_EPSILON
 #define __DBL_EPSILON__ DBL_EPSILON
@@ -205,7 +286,7 @@ namespace device {
 template<typename T> struct numeric_limits
 {
   EIGEN_DEVICE_FUNC
-  static T epsilon() { return 0; }
+  static EIGEN_CONSTEXPR T epsilon() { return 0; }
   static T (max)() { assert(false && "Highest not supported for this type"); }
   static T (min)() { assert(false && "Lowest not supported for this type"); }
   static T infinity() { assert(false && "Infinity not supported for this type"); }
@@ -213,91 +294,130 @@ template<typename T> struct numeric_limits
 };
 template<> struct numeric_limits<float>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static float epsilon() { return __FLT_EPSILON__; }
   EIGEN_DEVICE_FUNC
-  static float (max)() { return CUDART_MAX_NORMAL_F; }
-  EIGEN_DEVICE_FUNC
+  static float (max)() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_MAX_NORMAL_F;
+  #else
+    return HIPRT_MAX_NORMAL_F;
+  #endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static float (min)() { return FLT_MIN; }
   EIGEN_DEVICE_FUNC
-  static float infinity() { return CUDART_INF_F; }
+  static float infinity() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_INF_F;
+  #else
+    return HIPRT_INF_F;
+  #endif
+  }
   EIGEN_DEVICE_FUNC
-  static float quiet_NaN() { return CUDART_NAN_F; }
+  static float quiet_NaN() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_NAN_F;
+  #else
+    return HIPRT_NAN_F;
+  #endif
+  }
 };
 template<> struct numeric_limits<double>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static double epsilon() { return __DBL_EPSILON__; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static double (max)() { return DBL_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static double (min)() { return DBL_MIN; }
   EIGEN_DEVICE_FUNC
-  static double infinity() { return CUDART_INF; }
+  static double infinity() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_INF;
+  #else
+    return HIPRT_INF;
+  #endif
+  }
   EIGEN_DEVICE_FUNC
-  static double quiet_NaN() { return CUDART_NAN; }
+  static double quiet_NaN() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_NAN;
+  #else
+    return HIPRT_NAN;
+  #endif
+  }
 };
 template<> struct numeric_limits<int>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int (max)() { return INT_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static int (min)() { return INT_MIN; }
 };
 template<> struct numeric_limits<unsigned int>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned int epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned int (max)() { return UINT_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned int (min)() { return 0; }
 };
 template<> struct numeric_limits<long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long (max)() { return LONG_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long (min)() { return LONG_MIN; }
 };
 template<> struct numeric_limits<unsigned long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long (max)() { return ULONG_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long (min)() { return 0; }
 };
 template<> struct numeric_limits<long long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long long (max)() { return LLONG_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static long long (min)() { return LLONG_MIN; }
 };
 template<> struct numeric_limits<unsigned long long>
 {
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long long epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long long (max)() { return ULLONG_MAX; }
-  EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
   static unsigned long long (min)() { return 0; }
 };
+template<> struct numeric_limits<bool>
+{
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static bool epsilon() { return false; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static bool (max)() { return true; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR 
+  static bool (min)() { return false; }
+};
 
 }
 
-#endif
+#endif // defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
 
 /** \internal
-  * A base class do disable default copy ctor and copy assignement operator.
+  * A base class do disable default copy ctor and copy assignment operator.
   */
 class noncopyable
 {
@@ -309,13 +429,82 @@ protected:
 };
 
 /** \internal
-  * Convenient struct to get the result type of a unary or binary functor.
+  * Provides access to the number of elements in the object of as a compile-time constant expression.
+  * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default).
+  *
+  * Similar to std::tuple_size, but more general.
+  *
+  * It currently supports:
+  *  - any types T defining T::SizeAtCompileTime
+  *  - plain C arrays as T[N]
+  *  - std::array (c++11)
+  *  - some internal types such as SingleRange and AllRange
+  *
+  * The second template parameter eases SFINAE-based specializations.
+  */
+template<typename T, typename EnableIf = void> struct array_size {
+  enum { value = Dynamic };
+};
+
+template<typename T> struct array_size<T,typename internal::enable_if<((T::SizeAtCompileTime&0)==0)>::type> {
+  enum { value = T::SizeAtCompileTime };
+};
+
+template<typename T, int N> struct array_size<const T (&)[N]> {
+  enum { value = N };
+};
+template<typename T, int N> struct array_size<T (&)[N]> {
+  enum { value = N };
+};
+
+#if EIGEN_HAS_CXX11
+template<typename T, std::size_t N> struct array_size<const std::array<T,N> > {
+  enum { value = N };
+};
+template<typename T, std::size_t N> struct array_size<std::array<T,N> > {
+  enum { value = N };
+};
+#endif
+
+/** \internal
+  * Analogue of the std::size free function.
+  * It returns the size of the container or view \a x of type \c T
+  *
+  * It currently supports:
+  *  - any types T defining a member T::size() const
+  *  - plain C arrays as T[N]
   *
-  * It supports both the current STL mechanism (using the result_type member) as well as
-  * upcoming next STL generation (using a templated result member).
-  * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
   */
-#if EIGEN_HAS_STD_RESULT_OF
+template<typename T>
+EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); }
+
+template<typename T,std::size_t N>
+EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; }
+
+/** \internal
+  * Convenient struct to get the result type of a nullary, unary, binary, or
+  * ternary functor.
+  * 
+  * Pre C++11:
+  * Supports both a Func::result_type member and templated
+  * Func::result<Func(ArgTypes...)>::type member.
+  * 
+  * If none of these members is provided, then the type of the first
+  * argument is returned.
+  * 
+  * Post C++11:
+  * This uses std::result_of. However, note the `type` member removes
+  * const and converts references/pointers to their corresponding value type.
+  */
+#if EIGEN_HAS_STD_INVOKE_RESULT
+template<typename T> struct result_of;
+
+template<typename F, typename... ArgTypes>
+struct result_of<F(ArgTypes...)> {
+  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#elif EIGEN_HAS_STD_RESULT_OF
 template<typename T> struct result_of {
   typedef typename std::result_of<T>::type type1;
   typedef typename remove_all<type1>::type type;
@@ -327,6 +516,28 @@ struct has_none {int a[1];};
 struct has_std_result_type {int a[2];};
 struct has_tr1_result {int a[3];};
 
+template<typename Func, int SizeOf>
+struct nullary_result_of_select {};
+
+template<typename Func>
+struct nullary_result_of_select<Func, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
+
+template<typename Func>
+struct nullary_result_of_select<Func, sizeof(has_tr1_result)> {typedef typename Func::template result<Func()>::type type;};
+
+template<typename Func>
+struct result_of<Func()> {
+    template<typename T>
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
+    template<typename T>
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T()>::type const * = 0);
+    static has_none               testFunctor(...);
+
+    // note that the following indirection is needed for gcc-3.3
+    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+    typedef typename nullary_result_of_select<Func, FunctorType>::type type;
+};
+
 template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
 struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;};
 
@@ -396,6 +607,45 @@ struct result_of<Func(ArgType0,ArgType1,ArgType2)> {
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
     typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type;
 };
+
+#endif
+
+#if EIGEN_HAS_STD_INVOKE_RESULT
+template<typename F, typename... ArgTypes>
+struct invoke_result {
+  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#elif EIGEN_HAS_CXX11
+template<typename F, typename... ArgTypes>
+struct invoke_result {
+  typedef typename result_of<F(ArgTypes...)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#else
+template<typename F, typename ArgType0 = void, typename ArgType1 = void, typename ArgType2 = void>
+struct invoke_result {
+  typedef typename result_of<F(ArgType0, ArgType1, ArgType2)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F>
+struct invoke_result<F, void, void, void> {
+  typedef typename result_of<F()>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F, typename ArgType0>
+struct invoke_result<F, ArgType0, void, void> {
+  typedef typename result_of<F(ArgType0)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+
+template<typename F, typename ArgType0, typename ArgType1>
+struct invoke_result<F, ArgType0, ArgType1, void> {
+  typedef typename result_of<F(ArgType0, ArgType1)>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
 #endif
 
 struct meta_yes { char a[1]; };
@@ -405,10 +655,10 @@ struct meta_no  { char a[2]; };
 template <typename T>
 struct has_ReturnType
 {
-  template <typename C> static meta_yes testFunctor(typename C::ReturnType const *);
-  template <typename C> static meta_no testFunctor(...);
+  template <typename C> static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0);
+  template <typename C> static meta_no  testFunctor(...);
 
-  enum { value = sizeof(testFunctor<T>(0)) == sizeof(meta_yes) };
+  enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
 };
 
 template<typename T> const T* return_ptr();
@@ -465,20 +715,25 @@ class meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ?
 
 
 /** \internal Computes the least common multiple of two positive integer A and B
-  * at compile-time. It implements a naive algorithm testing all multiples of A.
-  * It thus works better if A>=B.
+  * at compile-time. 
   */
-template<int A, int B, int K=1, bool Done = ((A*K)%B)==0>
+template<int A, int B, int K=1, bool Done = ((A*K)%B)==0, bool Big=(A>=B)>
 struct meta_least_common_multiple
 {
   enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
 };
+template<int A, int B, int K, bool Done>
+struct meta_least_common_multiple<A,B,K,Done,false>
+{
+  enum { ret = meta_least_common_multiple<B,A,K>::ret };
+};
 template<int A, int B, int K>
-struct meta_least_common_multiple<A,B,K,true>
+struct meta_least_common_multiple<A,B,K,true,true>
 {
   enum { ret = A*K };
 };
 
+
 /** \internal determines whether the product of two numeric types is allowed and what the return type is */
 template<typename T, typename U> struct scalar_product_traits
 {
@@ -491,17 +746,27 @@ template<typename T, typename U> struct scalar_product_traits
 // typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
 // };
 
+/** \internal Obtains a POD type suitable to use as storage for an object of a size
+  * of at most Len bytes, aligned as specified by \c Align.
+  */
+template<unsigned Len, unsigned Align>
+struct aligned_storage {
+  struct type {
+    EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len];
+  };
+};
+
 } // end namespace internal
 
 namespace numext {
-  
-#if defined(__CUDA_ARCH__)
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
 #else
 template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
 #endif
 
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11
 using internal::device::numeric_limits;
 #else
 using std::numeric_limits;
@@ -510,6 +775,7 @@ using std::numeric_limits;
 // Integer division with rounding up.
 // T is assumed to be an integer type with a>=0, and b>0
 template<typename T>
+EIGEN_DEVICE_FUNC
 T div_ceil(const T &a, const T &b)
 {
   return (a+b-1) / b;
@@ -517,52 +783,30 @@ T div_ceil(const T &a, const T &b)
 
 // The aim of the following functions is to bypass -Wfloat-equal warnings
 // when we really want a strict equality comparison on floating points.
-template<typename X, typename Y> EIGEN_STRONG_INLINE
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool equal_strict(const X& x,const Y& y) { return x == y; }
 
-template<> EIGEN_STRONG_INLINE
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
 
-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool equal_strict(const double& x,const double& y) { return std::equal_to<double>()(x,y); }
+#endif
 
-template<typename X, typename Y> EIGEN_STRONG_INLINE
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool not_equal_strict(const X& x,const Y& y) { return x != y; }
 
-template<> EIGEN_STRONG_INLINE
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
 
-template<> EIGEN_STRONG_INLINE
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
+#endif
 
 } // end namespace numext
 
 } // end namespace Eigen
 
-// Define portable (u)int{32,64} types
-#if EIGEN_HAS_CXX11
-#include <cstdint>
-namespace Eigen {
-namespace numext {
-typedef std::uint32_t uint32_t;
-typedef std::int32_t  int32_t;
-typedef std::uint64_t uint64_t;
-typedef std::int64_t  int64_t;
-}
-}
-#else
-// Without c++11, all compilers able to compile Eigen also
-// provides the C99 stdint.h header file.
-#include <stdint.h>
-namespace Eigen {
-namespace numext {
-typedef ::uint32_t uint32_t;
-typedef ::int32_t  int32_t;
-typedef ::uint64_t uint64_t;
-typedef ::int64_t  int64_t;
-}
-}
-#endif
-
-
 #endif // EIGEN_META_H
diff --git a/contrib/eigen/Eigen/src/Core/util/StaticAssert.h b/contrib/eigen/Eigen/src/Core/util/StaticAssert.h
index 500e47792a437f8ba7c491b085436dc5c63231d6..c45de59016e1aa0ea37424595b8c7286b2e7d18c 100644
--- a/contrib/eigen/Eigen/src/Core/util/StaticAssert.h
+++ b/contrib/eigen/Eigen/src/Core/util/StaticAssert.h
@@ -27,7 +27,7 @@
 #ifndef EIGEN_STATIC_ASSERT
 #ifndef EIGEN_NO_STATIC_ASSERT
 
-  #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600))
+  #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600))
 
     // if native static_assert is enabled, let's use it
     #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
@@ -103,7 +103,10 @@
         STORAGE_KIND_MUST_MATCH=1,
         STORAGE_INDEX_MUST_MATCH=1,
         CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
-        SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1
+        SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
+        INVALID_TEMPLATE_PARAMETER=1,
+        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1,
+        THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1
       };
     };
 
@@ -182,7 +185,7 @@
      )
 
 #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
-    EIGEN_STATIC_ASSERT(!NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
+    EIGEN_STATIC_ASSERT(!Eigen::NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
 
 
 // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes
@@ -192,8 +195,8 @@
     YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
 
 #define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \
-      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \
-                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \
+      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) && \
+                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \
                           THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
 
 #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \
diff --git a/contrib/eigen/Eigen/src/Core/util/XprHelper.h b/contrib/eigen/Eigen/src/Core/util/XprHelper.h
index 6bb4970828e61096398e11d2f863338b26d35232..71c32b8a116ba86bf04a00e8c260ef912154a01a 100644
--- a/contrib/eigen/Eigen/src/Core/util/XprHelper.h
+++ b/contrib/eigen/Eigen/src/Core/util/XprHelper.h
@@ -49,6 +49,12 @@ template<typename T> struct is_valid_index_type
   };
 };
 
+// true if both types are not valid index types
+template<typename RowIndices, typename ColIndices>
+struct valid_indexed_view_overload {
+  enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
+};
+
 // promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
 //    expression * scalar
 // Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression.
@@ -123,19 +129,23 @@ struct promote_index_type
 template<typename T, int Value> class variable_if_dynamic
 {
   public:
-    EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic)
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
-    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    operator T() const { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
 };
 
 template<typename T> class variable_if_dynamic<T, Dynamic>
 {
     T m_value;
-    EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); }
   public:
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {}
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
@@ -146,8 +156,10 @@ template<typename T, int Value> class variable_if_dynamicindex
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
-    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
+    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+    T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
@@ -172,16 +184,7 @@ template<typename T> struct functor_traits
 
 template<typename T> struct packet_traits;
 
-template<typename T> struct unpacket_traits
-{
-  typedef T type;
-  typedef T half;
-  enum
-  {
-    size = 1,
-    alignment = 1
-  };
-};
+template<typename T> struct unpacket_traits;
 
 template<int Size, typename PacketType,
          bool Stop = Size==Dynamic || (Size%unpacket_traits<PacketType>::size)==0 || is_same<PacketType,typename unpacket_traits<PacketType>::half>::value>
@@ -400,7 +403,7 @@ template<typename T> struct plain_matrix_type_row_major
   typedef Matrix<typename traits<T>::Scalar,
                 Rows,
                 Cols,
-                (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor,
+                (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor,
                 MaxRows,
                 MaxCols
           > type;
@@ -417,7 +420,7 @@ struct ref_selector
     T const&,
     const T
   >::type type;
-  
+
   typedef typename conditional<
     bool(traits<T>::Flags & NestByRefBit),
     T &,
@@ -455,7 +458,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
 {
   enum {
     ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
-    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a tempory?
+    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a temporary?
                                                   //      Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
                                                   //      This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
                                                   //      for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
@@ -596,14 +599,14 @@ template<typename ExpressionType, typename Scalar = typename ExpressionType::Sca
 struct plain_row_type
 {
   typedef Matrix<Scalar, 1, ExpressionType::ColsAtCompileTime,
-                 ExpressionType::PlainObject::Options | RowMajor, 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType;
+                 int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType;
   typedef Array<Scalar, 1, ExpressionType::ColsAtCompileTime,
-                 ExpressionType::PlainObject::Options | RowMajor, 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType;
+                 int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType;
 
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixRowType,
-    ArrayRowType 
+    ArrayRowType
   >::type type;
 };
 
@@ -618,7 +621,7 @@ struct plain_col_type
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixColType,
-    ArrayColType 
+    ArrayColType
   >::type type;
 };
 
@@ -634,7 +637,7 @@ struct plain_diag_type
   typedef typename conditional<
     is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
     MatrixDiagType,
-    ArrayDiagType 
+    ArrayDiagType
   >::type type;
 };
 
@@ -671,24 +674,39 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> >
 template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
 { enum { ret = true }; };
 
+
+template<typename T> struct is_identity
+{ enum { value = false }; };
+
+template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
+{ enum { value = true }; };
+
+
 template<typename S1, typename S2> struct glue_shapes;
 template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type;  };
 
 template<typename T1, typename T2>
-bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+struct possibly_same_dense {
+  enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
+};
+
+template<typename T1, typename T2>
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
 {
   return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
 }
 
 template<typename T1, typename T2>
-bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
 {
   return false;
 }
 
 // Internal helper defining the cost of a scalar division for the type T.
 // The default heuristic can be specialized for each scalar type and architecture.
-template<typename T,bool Vectorized=false,typename EnaleIf = void>
+template<typename T,bool Vectorized=false,typename EnableIf = void>
 struct scalar_div_cost {
   enum { value = 8*NumTraits<T>::MulCost };
 };
@@ -735,7 +753,7 @@ std::string demangle_flags(int f)
   if(f&DirectAccessBit)             res += " | Direct";
   if(f&NestByRefBit)                res += " | NestByRef";
   if(f&NoPreferredStorageOrderBit)  res += " | NoPreferredStorageOrderBit";
-  
+
   return res;
 }
 #endif
@@ -832,7 +850,7 @@ struct ScalarBinaryOpTraits<void,void,BinaryOp>
 #define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
   EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType<ScalarBinaryOpTraits<LHS, RHS,BINOP> >::value), \
     YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    
+
 } // end namespace Eigen
 
 #endif // EIGEN_XPRHELPER_H
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/contrib/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index dc5fae06a3c0545a2abc207ad14e05eb40c1ebf9..081e918f1329b162604fc5bf9a36302feaa479b0 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -214,7 +214,7 @@ template<typename _MatrixType> class ComplexEigenSolver
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/ComplexSchur.h b/contrib/eigen/Eigen/src/Eigenvalues/ComplexSchur.h
index 4354e4018fda4a163431559d9b330498908681e6..fc71468f8dfcb9316c5ded0ea2c569094eedf22c 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -212,7 +212,7 @@ template<typename _MatrixType> class ComplexSchur
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/EigenSolver.h b/contrib/eigen/Eigen/src/Eigenvalues/EigenSolver.h
index f205b185deef348a0aca395a7bfd95c61e20a474..572b29e4e983aba8e87bf2f262d6bb28a5bec3d9 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/EigenSolver.h
@@ -110,7 +110,7 @@ template<typename _MatrixType> class EigenSolver
       *
       * \sa compute() for an example.
       */
-    EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
+    EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {}
 
     /** \brief Default constructor with memory preallocation
       *
@@ -277,7 +277,7 @@ template<typename _MatrixType> class EigenSolver
     template<typename InputType>
     EigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
 
-    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */
+    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/contrib/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
index 5f6bb82898b2d30b700f1e12a2f349afd1f34022..d0f9091bebef445bcf6ac5c1cb3aa559281ced98 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
@@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT
       *
       * \returns    Reference to \c *this
       *
-      * Accoring to \p options, this function computes eigenvalues and (if requested)
+      * According to \p options, this function computes eigenvalues and (if requested)
       * the eigenvectors of one of the following three generalized eigenproblems:
       * - \c Ax_lBx: \f$ Ax = \lambda B x \f$
       * - \c ABx_lx: \f$ ABx = \lambda x \f$
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/contrib/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index f647f69b0601e7470ecb19eebaa54f83b13666a2..1f21139346e7394b6c522dba3a4368125232fa99 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -267,7 +267,7 @@ template<typename _MatrixType> class HessenbergDecomposition
 
   private:
 
-    typedef Matrix<Scalar, 1, Size, Options | RowMajor, 1, MaxSize> VectorType;
+    typedef Matrix<Scalar, 1, Size, int(Options) | int(RowMajor), 1, MaxSize> VectorType;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp);
 
@@ -315,7 +315,7 @@ void HessenbergDecomposition<MatrixType>::_compute(MatrixType& matA, CoeffVector
 
     // A = A H'
     matA.rightCols(remainingSize)
-        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0));
+        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0));
   }
 }
 
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/contrib/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
index e4e426071135f7bbe727e76da8c5c2ce9298574c..66e5a3dbb047cb7365fe8eae7cd2547f2293b5a0 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -84,7 +84,7 @@ MatrixBase<Derived>::eigenvalues() const
   * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
   */
 template<typename MatrixType, unsigned int UpLo> 
-inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
 SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
 {
   PlainObject thisAsMatrix(*this);
@@ -147,7 +147,7 @@ MatrixBase<Derived>::operatorNorm() const
   * \sa eigenvalues(), MatrixBase::operatorNorm()
   */
 template<typename MatrixType, unsigned int UpLo>
-inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
 SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
 {
   return eigenvalues().cwiseAbs().maxCoeff();
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/RealQZ.h b/contrib/eigen/Eigen/src/Eigenvalues/RealQZ.h
index b3a910dd9f3088de4a489679f2199ef2bb0129c0..509130184b9e5424d229c2b9f99f4529fd52ebee 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/RealQZ.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/RealQZ.h
@@ -90,8 +90,9 @@ namespace Eigen {
         m_Z(size, size),
         m_workspace(size*2),
         m_maxIters(400),
-        m_isInitialized(false)
-        { }
+        m_isInitialized(false),
+        m_computeQZ(true)
+      {}
 
       /** \brief Constructor; computes real QZ decomposition of given matrices
        * 
@@ -108,9 +109,11 @@ namespace Eigen {
         m_Z(A.rows(),A.cols()),
         m_workspace(A.rows()*2),
         m_maxIters(400),
-        m_isInitialized(false) {
-          compute(A, B, computeQZ);
-        }
+        m_isInitialized(false),
+        m_computeQZ(true)
+      {
+        compute(A, B, computeQZ);
+      }
 
       /** \brief Returns matrix Q in the QZ decomposition. 
        *
@@ -161,7 +164,7 @@ namespace Eigen {
 
       /** \brief Reports whether previous computation was successful.
        *
-       * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+       * \returns \c Success if computation was successful, \c NoConvergence otherwise.
        */
       ComputationInfo info() const
       {
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/RealSchur.h b/contrib/eigen/Eigen/src/Eigenvalues/RealSchur.h
index 9191519abe9b1e87eea7f88b7e11c32532c9001c..7304ef3449e1e6a2d983dcb97da02af13497f813 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/RealSchur.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/RealSchur.h
@@ -190,7 +190,7 @@ template<typename _MatrixType> class RealSchur
     RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU);
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
@@ -270,8 +270,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
   // Step 1. Reduce to Hessenberg form
   m_hess.compute(matrix.derived()/scale);
 
-  // Step 2. Reduce to real Schur form  
-  computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+  // Step 2. Reduce to real Schur form
+  // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
+  //       to be able to pass our working-space buffer for the Householder to Dense evaluation.
+  m_workspaceVector.resize(matrix.cols());
+  if(computeU)
+    m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
+  computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);
 
   m_matT *= scale;
   
@@ -284,13 +289,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
   using std::abs;
 
   m_matT = matrixH;
-  if(computeU)
+  m_workspaceVector.resize(m_matT.cols());
+  if(computeU && !internal::is_same_dense(m_matU,matrixQ))
     m_matU = matrixQ;
   
   Index maxIters = m_maxIters;
   if (maxIters == -1)
     maxIters = m_maxIterationsPerRow * matrixH.rows();
-  m_workspaceVector.resize(m_matT.cols());
   Scalar* workspace = &m_workspaceVector.coeffRef(0);
 
   // The matrix m_matT is divided in three parts. 
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/contrib/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index d37656fa207a9b8915c1d5aff450532e19594c2a..14692365ffbf3a53eb67c0d64a961e098d14578b 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver;
 
 namespace internal {
 template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
+
 template<typename MatrixType, typename DiagType, typename SubDiagType>
+EIGEN_DEVICE_FUNC
 ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
 }
 
@@ -42,10 +44,14 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
   * \f$ v \f$ such that \f$ Av = \lambda v \f$.  The eigenvalues of a
   * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with
   * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the
-  * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$ (for selfadjoint
-  * matrices, the matrix \f$ V \f$ is always invertible). This is called the
+  * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$. This is called the
   * eigendecomposition.
   *
+  * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal
+  * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then
+  * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is
+  * equal to its transpose, \f$ V^{-1} = V^T \f$.
+  *
   * The algorithm exploits the fact that the matrix is selfadjoint, making it
   * faster and more accurate than the general purpose eigenvalue algorithms
   * implemented in EigenSolver and ComplexEigenSolver.
@@ -119,7 +125,10 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
         : m_eivec(),
           m_eivalues(),
           m_subdiag(),
-          m_isInitialized(false)
+          m_hcoeffs(),
+          m_info(InvalidInput),
+          m_isInitialized(false),
+          m_eigenvectorsOk(false)
     { }
 
     /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
@@ -139,7 +148,9 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
         : m_eivec(size, size),
           m_eivalues(size),
           m_subdiag(size > 1 ? size - 1 : 1),
-          m_isInitialized(false)
+          m_hcoeffs(size > 1 ? size - 1 : 1),
+          m_isInitialized(false),
+          m_eigenvectorsOk(false)
     {}
 
     /** \brief Constructor; computes eigendecomposition of given matrix.
@@ -163,7 +174,9 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
-        m_isInitialized(false)
+        m_hcoeffs(matrix.cols() > 1 ? matrix.cols() - 1 : 1),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false)
     {
       compute(matrix.derived(), options);
     }
@@ -250,6 +263,11 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       * matrix \f$ A \f$, then the matrix returned by this function is the
       * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$.
       *
+      * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal
+      * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then
+      * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is
+      * equal to its transpose, \f$ V^{-1} = V^T \f$.
+      *
       * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out
       *
@@ -337,7 +355,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     EIGEN_DEVICE_FUNC
     ComputationInfo info() const
@@ -354,7 +372,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
     static const int m_maxIterations = 30;
 
   protected:
-    static void check_template_parameters()
+    static EIGEN_DEVICE_FUNC
+    void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
@@ -362,6 +381,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
     EigenvectorsType m_eivec;
     RealVectorType m_eivalues;
     typename TridiagonalizationType::SubDiagonalType m_subdiag;
+    typename TridiagonalizationType::CoeffVectorType m_hcoeffs;
     ComputationInfo m_info;
     bool m_isInitialized;
     bool m_eigenvectorsOk;
@@ -403,7 +423,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
   
   const InputType &matrix(a_matrix.derived());
   
-  using std::abs;
+  EIGEN_USING_STD(abs);
   eigen_assert(matrix.cols() == matrix.rows());
   eigen_assert((options&~(EigVecMask|GenEigMask))==0
           && (options&EigVecMask)!=EigVecMask
@@ -434,7 +454,8 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
   if(scale==RealScalar(0)) scale = RealScalar(1);
   mat.template triangularView<Lower>() /= scale;
   m_subdiag.resize(n-1);
-  internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors);
+  m_hcoeffs.resize(n-1);
+  internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, computeEigenvectors);
 
   m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
   
@@ -479,10 +500,9 @@ namespace internal {
   * \returns \c Success or \c NoConvergence
   */
 template<typename MatrixType, typename DiagType, typename SubDiagType>
+EIGEN_DEVICE_FUNC
 ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
 {
-  using std::abs;
-
   ComputationInfo info;
   typedef typename MatrixType::Scalar Scalar;
 
@@ -493,15 +513,23 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
   
   typedef typename DiagType::RealScalar RealScalar;
   const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  const RealScalar precision = RealScalar(2)*NumTraits<RealScalar>::epsilon();
-  
+  const RealScalar precision_inv = RealScalar(1)/NumTraits<RealScalar>::epsilon();
   while (end>0)
   {
-    for (Index i = start; i<end; ++i)
-      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1])),precision) || abs(subdiag[i]) <= considerAsZero)
-        subdiag[i] = 0;
+    for (Index i = start; i<end; ++i) {
+      if (numext::abs(subdiag[i]) < considerAsZero) {
+        subdiag[i] = RealScalar(0);
+      } else {
+        // abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
+        // Scaled to prevent underflows.
+        const RealScalar scaled_subdiag = precision_inv * subdiag[i];
+        if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i])+numext::abs(diag[i+1]))) {
+          subdiag[i] = RealScalar(0);
+        }
+      }
+    }
 
-    // find the largest unreduced block
+    // find the largest unreduced block at the end of the matrix.
     while (end>0 && subdiag[end-1]==RealScalar(0))
     {
       end--;
@@ -535,7 +563,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
       diag.segment(i,n-i).minCoeff(&k);
       if (k > 0)
       {
-        std::swap(diag[i], diag[k+i]);
+        numext::swap(diag[i], diag[k+i]);
         if(computeEigenvectors)
           eivec.col(i).swap(eivec.col(k+i));
       }
@@ -566,10 +594,10 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
   EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
-    EIGEN_USING_STD_MATH(sqrt)
-    EIGEN_USING_STD_MATH(atan2)
-    EIGEN_USING_STD_MATH(cos)
-    EIGEN_USING_STD_MATH(sin)
+    EIGEN_USING_STD(sqrt)
+    EIGEN_USING_STD(atan2)
+    EIGEN_USING_STD(cos)
+    EIGEN_USING_STD(sin)
     const Scalar s_inv3 = Scalar(1)/Scalar(3);
     const Scalar s_sqrt3 = sqrt(Scalar(3));
 
@@ -605,8 +633,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
   EIGEN_DEVICE_FUNC
   static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
   {
-    EIGEN_USING_STD_MATH(sqrt)
-    EIGEN_USING_STD_MATH(abs)
+    EIGEN_USING_STD(abs);
+    EIGEN_USING_STD(sqrt);
     Index i0;
     // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
     mat.diagonal().cwiseAbs().maxCoeff(&i0);
@@ -720,7 +748,7 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
   EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
-    using std::sqrt;
+    EIGEN_USING_STD(sqrt);
     const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
     const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
     roots(0) = t1 - t0;
@@ -730,8 +758,8 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
   EIGEN_DEVICE_FUNC
   static inline void run(SolverType& solver, const MatrixType& mat, int options)
   {
-    EIGEN_USING_STD_MATH(sqrt);
-    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD(sqrt);
+    EIGEN_USING_STD(abs);
     
     eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows());
     eigen_assert((options&~(EigVecMask|GenEigMask))==0
@@ -804,32 +832,38 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 }
 
 namespace internal {
+
+// Francis implicit QR step.
 template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
 EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
 {
-  using std::abs;
+  // Wilkinson Shift.
   RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
   RealScalar e = subdiag[end-1];
   // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
   // underflow thus leading to inf/NaN values when using the following commented code:
-//   RealScalar e2 = numext::abs2(subdiag[end-1]);
-//   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
+  //   RealScalar e2 = numext::abs2(subdiag[end-1]);
+  //   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
   // This explain the following, somewhat more complicated, version:
   RealScalar mu = diag[end];
-  if(td==RealScalar(0))
-    mu -= abs(e);
-  else
-  {
-    RealScalar e2 = numext::abs2(subdiag[end-1]);
-    RealScalar h = numext::hypot(td,e);
-    if(e2==RealScalar(0)) mu -= (e / (td + (td>RealScalar(0) ? RealScalar(1) : RealScalar(-1)))) * (e / h);
-    else                  mu -= e2 / (td + (td>RealScalar(0) ? h : -h));
+  if(td==RealScalar(0)) {
+    mu -= numext::abs(e);
+  } else if (e != RealScalar(0)) {
+    const RealScalar e2 = numext::abs2(e);
+    const RealScalar h = numext::hypot(td,e);
+    if(e2 == RealScalar(0)) {
+      mu -= e / ((td + (td>RealScalar(0) ? h : -h)) / e);
+    } else {
+      mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); 
+    }
   }
-  
+
   RealScalar x = diag[start] - mu;
   RealScalar z = subdiag[start];
-  for (Index k = start; k < end; ++k)
+  // If z ever becomes zero, the Givens rotation will be the identity and
+  // z will stay zero for all future iterations.
+  for (Index k = start; k < end && z != RealScalar(0); ++k)
   {
     JacobiRotation<RealScalar> rot;
     rot.makeGivens(x, z);
@@ -842,12 +876,11 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta
     diag[k+1] = rot.s() * sdk + rot.c() * dkp1;
     subdiag[k] = rot.c() * sdk - rot.s() * dkp1;
     
-
     if (k > start)
       subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z;
 
+    // "Chasing the bulge" to return to triangular form.
     x = subdiag[k];
-
     if (k < end - 1)
     {
       z = -rot.s() * subdiag[k+1];
diff --git a/contrib/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h b/contrib/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h
index 1d102c17bc1fe331ec26d2964458bd1b53b578dd..674c92a39bd63e03a57841b66a7f6478d28e02e5 100644
--- a/contrib/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/contrib/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -11,10 +11,10 @@
 #ifndef EIGEN_TRIDIAGONALIZATION_H
 #define EIGEN_TRIDIAGONALIZATION_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-  
+
 template<typename MatrixType> struct TridiagonalizationMatrixTReturnType;
 template<typename MatrixType>
 struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
@@ -25,6 +25,7 @@ struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
 };
 
 template<typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
 void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
 }
 
@@ -344,6 +345,7 @@ namespace internal {
   * \sa Tridiagonalization::packedMatrix()
   */
 template<typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
 void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
 {
   using numext::conj;
@@ -352,7 +354,7 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
   Index n = matA.rows();
   eigen_assert(n==matA.cols());
   eigen_assert(n==hCoeffs.size()+1 || n==1);
-  
+
   for (Index i = 0; i<n-1; ++i)
   {
     Index remainingSize = n-i-1;
@@ -423,11 +425,13 @@ struct tridiagonalization_inplace_selector;
   *
   * \sa class Tridiagonalization
   */
-template<typename MatrixType, typename DiagonalType, typename SubDiagonalType>
-void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+template<typename MatrixType, typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
+void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag,
+                                CoeffVectorType& hcoeffs, bool extractQ)
 {
   eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1);
-  tridiagonalization_inplace_selector<MatrixType>::run(mat, diag, subdiag, extractQ);
+  tridiagonalization_inplace_selector<MatrixType>::run(mat, diag, subdiag, hcoeffs, extractQ);
 }
 
 /** \internal
@@ -439,10 +443,10 @@ struct tridiagonalization_inplace_selector
   typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
   typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
   template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+  static EIGEN_DEVICE_FUNC
+      void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ)
   {
-    CoeffVectorType hCoeffs(mat.cols()-1);
-    tridiagonalization_inplace(mat,hCoeffs);
+    tridiagonalization_inplace(mat, hCoeffs);
     diag = mat.diagonal().real();
     subdiag = mat.template diagonal<-1>().real();
     if(extractQ)
@@ -462,8 +466,8 @@ struct tridiagonalization_inplace_selector<MatrixType,3,false>
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
 
-  template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+  template<typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
+  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, bool extractQ)
   {
     using std::sqrt;
     const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
@@ -507,8 +511,9 @@ struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
 {
   typedef typename MatrixType::Scalar Scalar;
 
-  template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
+  template<typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
+  static EIGEN_DEVICE_FUNC
+  void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&, bool extractQ)
   {
     diag(0,0) = numext::real(mat(0,0));
     if(extractQ)
@@ -542,8 +547,8 @@ template<typename MatrixType> struct TridiagonalizationMatrixTReturnType
       result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
     }
 
-    Index rows() const { return m_matrix.rows(); }
-    Index cols() const { return m_matrix.cols(); }
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); }
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
   protected:
     typename MatrixType::Nested m_matrix;
diff --git a/contrib/eigen/Eigen/src/Geometry/AlignedBox.h b/contrib/eigen/Eigen/src/Geometry/AlignedBox.h
index 066eae4f92170f43e7a9941a2153c92b88fcfa26..55a9d0ae134a6ab2a9f5a6ede2dad5843ceda518 100644
--- a/contrib/eigen/Eigen/src/Geometry/AlignedBox.h
+++ b/contrib/eigen/Eigen/src/Geometry/AlignedBox.h
@@ -7,10 +7,46 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+// Function void Eigen::AlignedBox::transform(const Transform& transform)
+// is provided under the following license agreement:
+//
+// Software License Agreement (BSD License)
+//
+// Copyright (c) 2011-2014, Willow Garage, Inc.
+// Copyright (c) 2014-2015, Open Source Robotics Foundation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//  * Neither the name of Open Source Robotics Foundation nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
 #ifndef EIGEN_ALIGNEDBOX_H
 #define EIGEN_ALIGNEDBOX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
   *
@@ -63,7 +99,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** Default constructor initializing a null box. */
   EIGEN_DEVICE_FUNC inline AlignedBox()
-  { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); }
+  { if (EIGEN_CONST_CONDITIONAL(AmbientDimAtCompileTime!=Dynamic)) setEmpty(); }
 
   /** Constructs a null box with \a _dim the dimension of the ambient space. */
   EIGEN_DEVICE_FUNC inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
@@ -231,7 +267,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   {return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); }
 
   /** Returns an AlignedBox that is the union of \a b and \c *this.
-   * \note Merging with an empty box may result in a box bigger than \c *this. 
+   * \note Merging with an empty box may result in a box bigger than \c *this.
    * \sa extend(const AlignedBox&) */
   EIGEN_DEVICE_FUNC inline AlignedBox merged(const AlignedBox& b) const
   { return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); }
@@ -246,6 +282,15 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     return *this;
   }
 
+  /** \returns a copy of \c *this translated by the vector \a t. */
+  template<typename Derived>
+  EIGEN_DEVICE_FUNC inline AlignedBox translated(const MatrixBase<Derived>& a_t) const
+  {
+    AlignedBox result(m_min, m_max);
+    result.translate(a_t);
+    return result;
+  }
+
   /** \returns the squared distance between the point \a p and the box \c *this,
     * and zero if \a p is inside the box.
     * \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&)
@@ -265,14 +310,63 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     */
   template<typename Derived>
   EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
-  { EIGEN_USING_STD_MATH(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p))); }
+  { EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p))); }
 
   /** \returns the distance between the boxes \a b and \c *this,
     * and zero if the boxes intersect.
     * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&)
     */
   EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const AlignedBox& b) const
-  { EIGEN_USING_STD_MATH(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b))); }
+  { EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b))); }
+
+  /**
+   * Specialization of transform for pure translation.
+   */
+  template<int Mode, int Options>
+  EIGEN_DEVICE_FUNC inline void transform(
+      const typename Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>::TranslationType& translation)
+  {
+    this->translate(translation);
+  }
+
+  /**
+   * Transforms this box by \a transform and recomputes it to
+   * still be an axis-aligned box.
+   *
+   * \note This method is provided under BSD license (see the top of this file).
+   */
+  template<int Mode, int Options>
+  EIGEN_DEVICE_FUNC inline void transform(const Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>& transform)
+  {
+    // Only Affine and Isometry transforms are currently supported.
+    EIGEN_STATIC_ASSERT(Mode == Affine || Mode == AffineCompact || Mode == Isometry, THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS);
+
+    // Method adapted from FCL src/shape/geometric_shapes_utility.cpp#computeBV<AABB, Box>(...)
+    // https://github.com/flexible-collision-library/fcl/blob/fcl-0.4/src/shape/geometric_shapes_utility.cpp#L292
+    //
+    // Here's a nice explanation why it works: https://zeuxcg.org/2010/10/17/aabb-from-obb-with-component-wise-abs/
+
+    // two times rotated extent
+    const VectorType rotated_extent_2 = transform.linear().cwiseAbs() * sizes();
+    // two times new center
+    const VectorType rotated_center_2 = transform.linear() * (this->m_max + this->m_min) +
+        Scalar(2) * transform.translation();
+
+    this->m_max = (rotated_center_2 + rotated_extent_2) / Scalar(2);
+    this->m_min = (rotated_center_2 - rotated_extent_2) / Scalar(2);
+  }
+
+  /**
+   * \returns a copy of \c *this transformed by \a transform and recomputed to
+   * still be an axis-aligned box.
+   */
+  template<int Mode, int Options>
+  EIGEN_DEVICE_FUNC AlignedBox transformed(const Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>& transform) const
+  {
+    AlignedBox result(m_min, m_max);
+    result.transform(transform);
+    return result;
+  }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
diff --git a/contrib/eigen/Eigen/src/Geometry/AngleAxis.h b/contrib/eigen/Eigen/src/Geometry/AngleAxis.h
index 83ee1be4616fa9d004adcc521faaec7a59160098..78328b6b572c69447a59b42d6aa2a1f07afcdc2b 100644
--- a/contrib/eigen/Eigen/src/Geometry/AngleAxis.h
+++ b/contrib/eigen/Eigen/src/Geometry/AngleAxis.h
@@ -169,8 +169,8 @@ template<typename Scalar>
 template<typename QuatDerived>
 EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
-  EIGEN_USING_STD_MATH(atan2)
-  EIGEN_USING_STD_MATH(abs)
+  EIGEN_USING_STD(atan2)
+  EIGEN_USING_STD(abs)
   Scalar n = q.vec().norm();
   if(n<NumTraits<Scalar>::epsilon())
     n = q.vec().stableNorm();
@@ -217,8 +217,8 @@ template<typename Scalar>
 typename AngleAxis<Scalar>::Matrix3
 EIGEN_DEVICE_FUNC AngleAxis<Scalar>::toRotationMatrix(void) const
 {
-  EIGEN_USING_STD_MATH(sin)
-  EIGEN_USING_STD_MATH(cos)
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   Matrix3 res;
   Vector3 sin_axis  = sin(m_angle) * m_axis;
   Scalar c = cos(m_angle);
diff --git a/contrib/eigen/Eigen/src/Geometry/EulerAngles.h b/contrib/eigen/Eigen/src/Geometry/EulerAngles.h
index c633268af2c5b857d7de3610dadca3d31859a3f2..19b734ca7e002da98e829e666340bf847245ef53 100644
--- a/contrib/eigen/Eigen/src/Geometry/EulerAngles.h
+++ b/contrib/eigen/Eigen/src/Geometry/EulerAngles.h
@@ -36,9 +36,9 @@ template<typename Derived>
 EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar,3,1>
 MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
 {
-  EIGEN_USING_STD_MATH(atan2)
-  EIGEN_USING_STD_MATH(sin)
-  EIGEN_USING_STD_MATH(cos)
+  EIGEN_USING_STD(atan2)
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   /* Implemented from Graphics Gems IV */
   EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived,3,3)
 
diff --git a/contrib/eigen/Eigen/src/Geometry/Homogeneous.h b/contrib/eigen/Eigen/src/Geometry/Homogeneous.h
index 5f0da1a9e86971c482ea0f466c5f4c8c701c33ac..94083ac5413a183bb8c0f4fa7b7f41e6d13ba891 100644
--- a/contrib/eigen/Eigen/src/Geometry/Homogeneous.h
+++ b/contrib/eigen/Eigen/src/Geometry/Homogeneous.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_HOMOGENEOUS_H
 #define EIGEN_HOMOGENEOUS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
   *
@@ -72,9 +72,11 @@ template<typename MatrixType,int _Direction> class Homogeneous
       : m_matrix(matrix)
     {}
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
-    
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
+
     EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
 
     template<typename Rhs>
@@ -262,8 +264,10 @@ struct homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs>
       m_rhs(rhs)
   {}
 
-  EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   template<typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const
   {
@@ -300,8 +304,8 @@ struct homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>
     : m_lhs(lhs), m_rhs(rhs)
   {}
 
-  EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
-  EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   template<typename Dest> EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const
   {
@@ -322,7 +326,7 @@ template<typename ArgType,int Direction>
 struct evaluator_traits<Homogeneous<ArgType,Direction> >
 {
   typedef typename storage_kind_to_evaluator_kind<typename ArgType::StorageKind>::Kind Kind;
-  typedef HomogeneousShape Shape;  
+  typedef HomogeneousShape Shape;
 };
 
 template<> struct AssignmentKind<DenseShape,HomogeneousShape> { typedef Dense2Dense Kind; };
@@ -414,7 +418,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, Homogeneous
   typedef typename helper::ConstantBlock ConstantBlock;
   typedef typename helper::Xpr RefactoredXpr;
   typedef evaluator<RefactoredXpr> Base;
-  
+
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(  xpr.lhs().nestedExpression() .lazyProduct(  xpr.rhs().template topRows<helper::Dim>(xpr.lhs().nestedExpression().cols()) )
             + ConstantBlock(xpr.rhs().row(xpr.rhs().rows()-1),xpr.lhs().rows(), 1) )
@@ -467,7 +471,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   typedef typename helper::ConstantBlock ConstantBlock;
   typedef typename helper::Xpr RefactoredXpr;
   typedef evaluator<RefactoredXpr> Base;
-  
+
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(   xpr.lhs().template leftCols<helper::Dim>(xpr.rhs().nestedExpression().rows()) .lazyProduct( xpr.rhs().nestedExpression() )
             + ConstantBlock(xpr.lhs().col(xpr.lhs().cols()-1),1,xpr.rhs().cols()) )
diff --git a/contrib/eigen/Eigen/src/Geometry/Hyperplane.h b/contrib/eigen/Eigen/src/Geometry/Hyperplane.h
index 05929b2994103b8b37b6c32ef6b78bd4f2f6d4b3..cebe0355702f2d6ec0313fa6010605da0e025a06 100644
--- a/contrib/eigen/Eigen/src/Geometry/Hyperplane.h
+++ b/contrib/eigen/Eigen/src/Geometry/Hyperplane.h
@@ -119,7 +119,7 @@ public:
     * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
     * so an arbitrary choice is made.
     */
-  // FIXME to be consitent with the rest this could be implemented as a static Through function ??
+  // FIXME to be consistent with the rest this could be implemented as a static Through function ??
   EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
   {
     normal() = parametrized.direction().unitOrthogonal();
diff --git a/contrib/eigen/Eigen/src/Geometry/OrthoMethods.h b/contrib/eigen/Eigen/src/Geometry/OrthoMethods.h
index a035e6310a75d9f3227ad27967aff0fda946d1fa..524aebe1b96a53a5fac41cffd09a7efb9281f63e 100644
--- a/contrib/eigen/Eigen/src/Geometry/OrthoMethods.h
+++ b/contrib/eigen/Eigen/src/Geometry/OrthoMethods.h
@@ -27,9 +27,10 @@ namespace Eigen {
 template<typename Derived>
 template<typename OtherDerived>
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
 #else
-inline typename MatrixBase<Derived>::PlainObject
+typename MatrixBase<Derived>::PlainObject
 #endif
 MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 {
diff --git a/contrib/eigen/Eigen/src/Geometry/ParametrizedLine.h b/contrib/eigen/Eigen/src/Geometry/ParametrizedLine.h
index 1e985d8cde2ee65762ceaf0142ee720f55a0d59e..584f50087bdf79be820dd7bb30bf2f75a0697955 100644
--- a/contrib/eigen/Eigen/src/Geometry/ParametrizedLine.h
+++ b/contrib/eigen/Eigen/src/Geometry/ParametrizedLine.h
@@ -87,7 +87,7 @@ public:
   /** \returns the distance of a point \a p to its projection onto the line \c *this.
     * \sa squaredDistance()
     */
-  EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const { EIGEN_USING_STD_MATH(sqrt) return sqrt(squaredDistance(p)); }
+  EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const { EIGEN_USING_STD(sqrt) return sqrt(squaredDistance(p)); }
 
   /** \returns the projection of a point \a p onto the line \c *this. */
   EIGEN_DEVICE_FUNC VectorType projection(const VectorType& p) const
@@ -104,7 +104,44 @@ public:
   template <int OtherOptions>
   EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
 
-  /** \returns \c *this with scalar type casted to \a NewScalarType
+  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
+    *
+    * \param mat the Dim x Dim transformation matrix
+    * \param traits specifies whether the matrix \a mat represents an #Isometry
+    *               or a more generic #Affine transformation. The default is #Affine.
+    */
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
+  {
+    if (traits==Affine)
+      direction() = (mat * direction()).normalized();
+    else if (traits==Isometry)
+      direction() = mat * direction();
+    else
+    {
+      eigen_assert(0 && "invalid traits value in ParametrizedLine::transform()");
+    }
+    origin() = mat * origin();
+    return *this;
+  }
+
+  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
+    *
+    * \param t the transformation of dimension Dim
+    * \param traits specifies whether the transformation \a t represents an #Isometry
+    *               or a more generic #Affine transformation. The default is #Affine.
+    *               Other kind of transformations are not supported.
+    */
+  template<int TrOptions>
+  EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
+                                                       TransformTraits traits = Affine)
+  {
+    transform(t.linear(), traits);
+    origin() += t.translation();
+    return *this;
+  }
+
+/** \returns \c *this with scalar type casted to \a NewScalarType
     *
     * Note that if \a NewScalarType is equal to the current scalar type of \c *this
     * then this function smartly returns a const reference to \c *this.
diff --git a/contrib/eigen/Eigen/src/Geometry/Quaternion.h b/contrib/eigen/Eigen/src/Geometry/Quaternion.h
index b818206568e7e6b89ee5da1003c0ba1409053f52..3259e592df85afa2cc1a1e3518776f2ada1474c1 100644
--- a/contrib/eigen/Eigen/src/Geometry/Quaternion.h
+++ b/contrib/eigen/Eigen/src/Geometry/Quaternion.h
@@ -141,7 +141,7 @@ class QuaternionBase : public RotationBase<Derived, 3>
   template<class OtherDerived> EIGEN_DEVICE_FUNC Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
 
   /** \returns an equivalent 3x3 rotation matrix */
-  EIGEN_DEVICE_FUNC Matrix3 toRotationMatrix() const;
+  EIGEN_DEVICE_FUNC inline Matrix3 toRotationMatrix() const;
 
   /** \returns the quaternion which transform \a a into \a b through a rotation */
   template<typename Derived1, typename Derived2>
@@ -158,6 +158,22 @@ class QuaternionBase : public RotationBase<Derived, 3>
 
   template<class OtherDerived> EIGEN_DEVICE_FUNC Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
 
+  /** \returns true if each coefficients of \c *this and \a other are all exactly equal.
+    * \warning When using floating point scalar values you probably should rather use a
+    *          fuzzy comparison such as isApprox()
+    * \sa isApprox(), operator!= */
+  template<class OtherDerived>
+  EIGEN_DEVICE_FUNC inline bool operator==(const QuaternionBase<OtherDerived>& other) const
+  { return coeffs() == other.coeffs(); }
+
+  /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
+    * \warning When using floating point scalar values you probably should rather use a
+    *          fuzzy comparison such as isApprox()
+    * \sa isApprox(), operator== */
+  template<class OtherDerived>
+  EIGEN_DEVICE_FUNC inline bool operator!=(const QuaternionBase<OtherDerived>& other) const
+  { return coeffs() != other.coeffs(); }
+
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
@@ -181,20 +197,27 @@ class QuaternionBase : public RotationBase<Derived, 3>
   #else
 
   template<typename NewScalarType>
-  EIGEN_DEVICE_FUNC inline 
+  EIGEN_DEVICE_FUNC inline
   typename internal::enable_if<internal::is_same<Scalar,NewScalarType>::value,const Derived&>::type cast() const
   {
     return derived();
   }
 
   template<typename NewScalarType>
-  EIGEN_DEVICE_FUNC inline 
+  EIGEN_DEVICE_FUNC inline
   typename internal::enable_if<!internal::is_same<Scalar,NewScalarType>::value,Quaternion<NewScalarType> >::type cast() const
   {
     return Quaternion<NewScalarType>(coeffs().template cast<NewScalarType>());
   }
   #endif
 
+#ifndef EIGEN_NO_IO
+  friend std::ostream& operator<<(std::ostream& s, const QuaternionBase<Derived>& q) {
+    s << q.x() << "i + " << q.y() << "j + " << q.z() << "k" << " + " << q.w();
+    return s;
+  }
+#endif
+
 #ifdef EIGEN_QUATERNIONBASE_PLUGIN
 # include EIGEN_QUATERNIONBASE_PLUGIN
 #endif
@@ -294,6 +317,21 @@ public:
   EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
   { m_coeffs = other.coeffs().template cast<Scalar>(); }
 
+#if EIGEN_HAS_RVALUE_REFERENCES
+  // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
+  /** Default move constructor */
+  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+    : m_coeffs(std::move(other.coeffs()))
+  {}
+
+  /** Default move assignment operator */
+  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
+  {
+    m_coeffs = std::move(other.coeffs());
+    return *this;
+  }
+#endif
+
   EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
 
   template<typename Derived1, typename Derived2>
@@ -522,8 +560,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator
 template<class Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisType& aa)
 {
-  EIGEN_USING_STD_MATH(cos)
-  EIGEN_USING_STD_MATH(sin)
+  EIGEN_USING_STD(cos)
+  EIGEN_USING_STD(sin)
   Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
   this->w() = cos(ha);
   this->vec() = sin(ha) * aa.axis();
@@ -599,7 +637,7 @@ template<class Derived>
 template<typename Derived1, typename Derived2>
 EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
 {
-  EIGEN_USING_STD_MATH(sqrt)
+  EIGEN_USING_STD(sqrt)
   Vector3 v0 = a.normalized();
   Vector3 v1 = b.normalized();
   Scalar c = v1.dot(v0);
@@ -640,13 +678,13 @@ EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::setFromTwoVectors(con
 template<typename Scalar, int Options>
 EIGEN_DEVICE_FUNC Quaternion<Scalar,Options> Quaternion<Scalar,Options>::UnitRandom()
 {
-  EIGEN_USING_STD_MATH(sqrt)
-  EIGEN_USING_STD_MATH(sin)
-  EIGEN_USING_STD_MATH(cos)
+  EIGEN_USING_STD(sqrt)
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   const Scalar u1 = internal::random<Scalar>(0, 1),
                u2 = internal::random<Scalar>(0, 2*EIGEN_PI),
                u3 = internal::random<Scalar>(0, 2*EIGEN_PI);
-  const Scalar a = sqrt(1 - u1),
+  const Scalar a = sqrt(Scalar(1) - u1),
                b = sqrt(u1);
   return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
 }
@@ -725,7 +763,7 @@ template <class OtherDerived>
 EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar
 QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& other) const
 {
-  EIGEN_USING_STD_MATH(atan2)
+  EIGEN_USING_STD(atan2)
   Quaternion<Scalar> d = (*this) * other.conjugate();
   return Scalar(2) * atan2( d.vec().norm(), numext::abs(d.w()) );
 }
@@ -743,8 +781,8 @@ template <class OtherDerived>
 EIGEN_DEVICE_FUNC Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const
 {
-  EIGEN_USING_STD_MATH(acos)
-  EIGEN_USING_STD_MATH(sin)
+  EIGEN_USING_STD(acos)
+  EIGEN_USING_STD(sin)
   const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
   Scalar d = this->dot(other);
   Scalar absD = numext::abs(d);
@@ -781,7 +819,7 @@ struct quaternionbase_assign_impl<Other,3,3>
   template<class Derived> EIGEN_DEVICE_FUNC static inline void run(QuaternionBase<Derived>& q, const Other& a_mat)
   {
     const typename internal::nested_eval<Other,2>::type mat(a_mat);
-    EIGEN_USING_STD_MATH(sqrt)
+    EIGEN_USING_STD(sqrt)
     // This algorithm comes from  "Quaternion Calculus and Fast Animation",
     // Ken Shoemake, 1987 SIGGRAPH course notes
     Scalar t = mat.trace();
diff --git a/contrib/eigen/Eigen/src/Geometry/Rotation2D.h b/contrib/eigen/Eigen/src/Geometry/Rotation2D.h
index 884b7d0ee95a2a8eb427068df02d18db24fa5e75..d0bd57569f2e24fb9438909a11fb552f52f95d89 100644
--- a/contrib/eigen/Eigen/src/Geometry/Rotation2D.h
+++ b/contrib/eigen/Eigen/src/Geometry/Rotation2D.h
@@ -175,7 +175,7 @@ template<typename Scalar>
 template<typename Derived>
 EIGEN_DEVICE_FUNC Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
 {
-  EIGEN_USING_STD_MATH(atan2)
+  EIGEN_USING_STD(atan2)
   EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
   m_angle = atan2(mat.coeff(1,0), mat.coeff(0,0));
   return *this;
@@ -187,8 +187,8 @@ template<typename Scalar>
 typename Rotation2D<Scalar>::Matrix2
 EIGEN_DEVICE_FUNC Rotation2D<Scalar>::toRotationMatrix(void) const
 {
-  EIGEN_USING_STD_MATH(sin)
-  EIGEN_USING_STD_MATH(cos)
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   Scalar sinA = sin(m_angle);
   Scalar cosA = cos(m_angle);
   return (Matrix2() << cosA, -sinA, sinA, cosA).finished();
diff --git a/contrib/eigen/Eigen/src/Geometry/Scaling.h b/contrib/eigen/Eigen/src/Geometry/Scaling.h
old mode 100755
new mode 100644
index f58ca03d94a0dc4de32e35dc2cdcf0bf3e04c6db..d352f1f2b8abcbf23c5a184b1e09f22fa16af89b
--- a/contrib/eigen/Eigen/src/Geometry/Scaling.h
+++ b/contrib/eigen/Eigen/src/Geometry/Scaling.h
@@ -14,7 +14,7 @@ namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
   *
-  * \class Scaling
+  * \class UniformScaling
   *
   * \brief Represents a generic uniform scaling transformation
   *
@@ -29,6 +29,22 @@ namespace Eigen {
   *
   * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform
   */
+
+namespace internal
+{
+  // This helper helps nvcc+MSVC to properly parse this file.
+  // See bug 1412.
+  template <typename Scalar, int Dim, int Mode>
+  struct uniformscaling_times_affine_returntype
+  {
+    enum
+    {
+      NewMode = int(Mode) == int(Isometry) ? Affine : Mode
+    };
+    typedef Transform <Scalar, Dim, NewMode> type;
+  };
+}
+
 template<typename _Scalar>
 class UniformScaling
 {
@@ -60,9 +76,11 @@ public:
 
   /** Concatenates a uniform scaling and an affine transformation */
   template<int Dim, int Mode, int Options>
-  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const
+  inline typename
+	internal::uniformscaling_times_affine_returntype<Scalar,Dim,Mode>::type
+	operator* (const Transform<Scalar, Dim, Mode, Options>& t) const
   {
-    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
+    typename internal::uniformscaling_times_affine_returntype<Scalar,Dim,Mode>::type res = t;
     res.prescale(factor());
     return res;
   }
@@ -70,7 +88,7 @@ public:
   /** Concatenates a uniform scaling and a linear transformation matrix */
   // TODO returns an expression
   template<typename Derived>
-  inline typename internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
+  inline typename Eigen::internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
   { return other * m_factor; }
 
   template<typename Derived,int Dim>
@@ -110,7 +128,7 @@ public:
 /** Concatenates a linear transformation matrix and a uniform scaling
   * \relates UniformScaling
   */
-// NOTE this operator is defiend in MatrixBase and not as a friend function
+// NOTE this operator is defined in MatrixBase and not as a friend function
 // of UniformScaling to fix an internal crash of Intel's ICC
 template<typename Derived,typename Scalar>
 EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product)
diff --git a/contrib/eigen/Eigen/src/Geometry/Transform.h b/contrib/eigen/Eigen/src/Geometry/Transform.h
index c21d9e550e8f74c22c9b272f2cadd5cb9c870caa..52b8c2a4eb8e57f9ebd7ca27ee6412a89908314b 100644
--- a/contrib/eigen/Eigen/src/Geometry/Transform.h
+++ b/contrib/eigen/Eigen/src/Geometry/Transform.h
@@ -12,7 +12,7 @@
 #ifndef EIGEN_TRANSFORM_H
 #define EIGEN_TRANSFORM_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -47,7 +47,7 @@ struct transform_left_product_impl;
 
 template< typename Lhs,
           typename Rhs,
-          bool AnyProjective = 
+          bool AnyProjective =
             transform_traits<Lhs>::IsProjective ||
             transform_traits<Rhs>::IsProjective>
 struct transform_transform_product_impl;
@@ -97,6 +97,9 @@ template<int Mode> struct transform_make_affine;
   *              - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix.
   *              - #Projective: the transformation is stored as a (Dim+1)^2 matrix
   *                             without any assumption.
+  *              - #Isometry: same as #Affine with the additional assumption that
+  *                           the linear part represents a rotation. This assumption is exploited
+  *                           to speed up some functions such as inverse() and rotation().
   * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor.
   *                  These Options are passed directly to the underlying matrix type.
   *
@@ -115,7 +118,7 @@ template<int Mode> struct transform_make_affine;
   * \end{array} \right) \f$
   *
   * Note that for a projective transformation the last row can be anything,
-  * and then the interpretation of different parts might be sightly different.
+  * and then the interpretation of different parts might be slightly different.
   *
   * However, unlike a plain matrix, the Transform class provides many features
   * simplifying both its assembly and usage. In particular, it can be composed
@@ -220,9 +223,9 @@ public:
   /** type of the matrix used to represent the linear part of the transformation */
   typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType;
   /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart;
+  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (int(Options)&RowMajor)==0> LinearPart;
   /** type of read reference to the linear part of the transformation */
-  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart;
+  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (int(Options)&RowMajor)==0> ConstLinearPart;
   /** type of read/write reference to the affine part of the transformation */
   typedef typename internal::conditional<int(Mode)==int(AffineCompact),
                               MatrixType&,
@@ -239,7 +242,7 @@ public:
   typedef const Block<ConstMatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> ConstTranslationPart;
   /** corresponding translation type */
   typedef Translation<Scalar,Dim> TranslationType;
-  
+
   // this intermediate enum is needed to avoid an ICE with gcc 3.4 and 4.0
   enum { TransformTimeDiagonalMode = ((Mode==int(Isometry))?Affine:int(Mode)) };
   /** The return type of the product between a diagonal matrix and a transform */
@@ -259,12 +262,6 @@ public:
     internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix);
   }
 
-  EIGEN_DEVICE_FUNC inline Transform(const Transform& other)
-  {
-    check_template_params();
-    m_matrix = other.m_matrix;
-  }
-
   EIGEN_DEVICE_FUNC inline explicit Transform(const TranslationType& t)
   {
     check_template_params();
@@ -282,9 +279,6 @@ public:
     *this = r;
   }
 
-  EIGEN_DEVICE_FUNC inline Transform& operator=(const Transform& other)
-  { m_matrix = other.m_matrix; return *this; }
-
   typedef internal::transform_take_affine_part<Transform> take_affine_part;
 
   /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
@@ -308,7 +302,7 @@ public:
     internal::transform_construct_from_matrix<OtherDerived,Mode,Options,Dim,HDim>::run(this, other.derived());
     return *this;
   }
-  
+
   template<int OtherOptions>
   EIGEN_DEVICE_FUNC inline Transform(const Transform<Scalar,Dim,Mode,OtherOptions>& other)
   {
@@ -335,7 +329,7 @@ public:
            OtherModeIsAffineCompact = OtherMode == int(AffineCompact)
     };
 
-    if(ModeIsAffineCompact == OtherModeIsAffineCompact)
+    if(EIGEN_CONST_CONDITIONAL(ModeIsAffineCompact == OtherModeIsAffineCompact))
     {
       // We need the block expression because the code is compiled for all
       // combinations of transformations and will trigger a compile time error
@@ -343,7 +337,7 @@ public:
       m_matrix.template block<Dim,Dim+1>(0,0) = other.matrix().template block<Dim,Dim+1>(0,0);
       makeAffine();
     }
-    else if(OtherModeIsAffineCompact)
+    else if(EIGEN_CONST_CONDITIONAL(OtherModeIsAffineCompact))
     {
       typedef typename Transform<Scalar,Dim,OtherMode,OtherOptions>::MatrixType OtherMatrixType;
       internal::transform_construct_from_matrix<OtherMatrixType,Mode,Options,Dim,HDim>::run(this, other.matrix());
@@ -380,9 +374,9 @@ public:
   inline Transform& operator=(const QTransform& other);
   inline QTransform toQTransform(void) const;
   #endif
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_matrix.cols(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); }
 
   /** shortcut for m_matrix(row,col);
     * \sa MatrixBase::operator(Index,Index) const */
@@ -456,7 +450,7 @@ public:
   /** \returns The product expression of a transform \a a times a diagonal matrix \a b
     *
     * The rhs diagonal matrix is interpreted as an affine scaling transformation. The
-    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
+    * product results in a Transform of the same type (mode) as the lhs only if the lhs
     * mode is no isometry. In that case, the returned transform is an affinity.
     */
   template<typename DiagonalDerived>
@@ -471,7 +465,7 @@ public:
   /** \returns The product expression of a diagonal matrix \a a times a transform \a b
     *
     * The lhs diagonal matrix is interpreted as an affine scaling transformation. The
-    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
+    * product results in a Transform of the same type (mode) as the lhs only if the lhs
     * mode is no isometry. In that case, the returned transform is an affinity.
     */
   template<typename DiagonalDerived>
@@ -481,7 +475,7 @@ public:
     TransformTimeDiagonalReturnType res;
     res.linear().noalias() = a*b.linear();
     res.translation().noalias() = a*b.translation();
-    if (Mode!=int(AffineCompact))
+    if (EIGEN_CONST_CONDITIONAL(Mode!=int(AffineCompact)))
       res.matrix().row(Dim) = b.matrix().row(Dim);
     return res;
   }
@@ -494,7 +488,7 @@ public:
   {
     return internal::transform_transform_product_impl<Transform,Transform>::run(*this,other);
   }
-  
+
   #if EIGEN_COMP_ICC
 private:
   // this intermediate structure permits to workaround a bug in ICC 11:
@@ -503,13 +497,13 @@ private:
   //  (the meaning of a name may have changed since the template declaration -- the type of the template is:
   // "Eigen::internal::transform_transform_product_impl<Eigen::Transform<double, 3, 32, 0>,
   //     Eigen::Transform<double, 3, Mode, Options>, <expression>>::ResultType (const Eigen::Transform<double, 3, Mode, Options> &) const")
-  // 
+  //
   template<int OtherMode,int OtherOptions> struct icc_11_workaround
   {
     typedef internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> > ProductType;
     typedef typename ProductType::ResultType ResultType;
   };
-  
+
 public:
   /** Concatenates two different transformations */
   template<int OtherMode,int OtherOptions>
@@ -542,7 +536,7 @@ public:
   }
 
   template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC 
+  EIGEN_DEVICE_FUNC
   inline Transform& scale(const MatrixBase<OtherDerived> &other);
 
   template<typename OtherDerived>
@@ -572,18 +566,18 @@ public:
   EIGEN_DEVICE_FUNC Transform& preshear(const Scalar& sx, const Scalar& sy);
 
   EIGEN_DEVICE_FUNC inline Transform& operator=(const TranslationType& t);
-  
+
   EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
-  
+
   EIGEN_DEVICE_FUNC inline Transform operator*(const TranslationType& t) const;
 
-  EIGEN_DEVICE_FUNC 
+  EIGEN_DEVICE_FUNC
   inline Transform& operator=(const UniformScaling<Scalar>& t);
-  
+
   EIGEN_DEVICE_FUNC
   inline Transform& operator*=(const UniformScaling<Scalar>& s) { return scale(s.factor()); }
-  
+
   EIGEN_DEVICE_FUNC
   inline TransformTimeDiagonalReturnType operator*(const UniformScaling<Scalar>& s) const
   {
@@ -602,7 +596,9 @@ public:
   template<typename Derived>
   EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
 
-  EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const;
+  typedef typename internal::conditional<int(Mode)==Isometry,ConstLinearPart,const LinearMatrixType>::type RotationReturnType;
+  EIGEN_DEVICE_FUNC RotationReturnType rotation() const;
+
   template<typename RotationMatrixType, typename ScalingMatrixType>
   EIGEN_DEVICE_FUNC
   void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
@@ -684,7 +680,7 @@ public:
   #ifdef EIGEN_TRANSFORM_PLUGIN
   #include EIGEN_TRANSFORM_PLUGIN
   #endif
-  
+
 protected:
   #ifndef EIGEN_PARSED_BY_DOXYGEN
     EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void check_template_params()
@@ -755,7 +751,7 @@ template<typename Scalar, int Dim, int Mode,int Options>
 Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QMatrix& other)
 {
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (Mode == int(AffineCompact))
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
     m_matrix << other.m11(), other.m21(), other.dx(),
                 other.m12(), other.m22(), other.dy();
   else
@@ -801,7 +797,7 @@ Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator
 {
   check_template_params();
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (Mode == int(AffineCompact))
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
     m_matrix << other.m11(), other.m21(), other.dx(),
                 other.m12(), other.m22(), other.dy();
   else
@@ -819,7 +815,7 @@ template<typename Scalar, int Dim, int Mode, int Options>
 QTransform Transform<Scalar,Dim,Mode,Options>::toQTransform(void) const
 {
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (Mode == int(AffineCompact))
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
     return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
                       m_matrix.coeff(0,1), m_matrix.coeff(1,1),
                       m_matrix.coeff(0,2), m_matrix.coeff(1,2));
@@ -912,7 +908,7 @@ EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::pretranslate(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  if(int(Mode)==int(Projective))
+  if(EIGEN_CONST_CONDITIONAL(int(Mode)==int(Projective)))
     affine() += other * m_matrix.row(Dim);
   else
     translation() += other;
@@ -1046,20 +1042,43 @@ EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim
 *** Special functions ***
 ************************/
 
+namespace internal {
+template<int Mode> struct transform_rotation_impl {
+  template<typename TransformType>
+  EIGEN_DEVICE_FUNC static inline
+  const typename TransformType::LinearMatrixType run(const TransformType& t)
+  {
+    typedef typename TransformType::LinearMatrixType LinearMatrixType;
+    LinearMatrixType result;
+    t.computeRotationScaling(&result, (LinearMatrixType*)0);
+    return result;
+  }
+};
+template<> struct transform_rotation_impl<Isometry> {
+  template<typename TransformType>
+  EIGEN_DEVICE_FUNC static inline
+  typename TransformType::ConstLinearPart run(const TransformType& t)
+  {
+    return t.linear();
+  }
+};
+}
 /** \returns the rotation part of the transformation
   *
+  * If Mode==Isometry, then this method is an alias for linear(),
+  * otherwise it calls computeRotationScaling() to extract the rotation
+  * through a SVD decomposition.
   *
   * \svd_module
   *
   * \sa computeRotationScaling(), computeScalingRotation(), class SVD
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-EIGEN_DEVICE_FUNC const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
+EIGEN_DEVICE_FUNC
+typename Transform<Scalar,Dim,Mode,Options>::RotationReturnType
 Transform<Scalar,Dim,Mode,Options>::rotation() const
 {
-  LinearMatrixType result;
-  computeRotationScaling(&result, (LinearMatrixType*)0);
-  return result;
+  return internal::transform_rotation_impl<Mode>::run(*this);
 }
 
 
@@ -1078,17 +1097,18 @@ template<typename Scalar, int Dim, int Mode, int Options>
 template<typename RotationMatrixType, typename ScalingMatrixType>
 EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
 {
+  // Note that JacobiSVD is faster than BDCSVD for small matrices.
   JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
 
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1
   VectorType sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint());
+  sv.coeffRef(Dim-1) *= x;
+  if(scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
   if(rotation)
   {
     LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->lazyAssign(m * svd.matrixV().adjoint());
+    m.col(Dim-1) *= x;
+    *rotation = m * svd.matrixV().adjoint();
   }
 }
 
@@ -1107,17 +1127,18 @@ template<typename Scalar, int Dim, int Mode, int Options>
 template<typename ScalingMatrixType, typename RotationMatrixType>
 EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
 {
+  // Note that JacobiSVD is faster than BDCSVD for small matrices.
   JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
 
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1
   VectorType sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint());
+  sv.coeffRef(Dim-1) *= x;
+  if(scaling) *scaling = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
   if(rotation)
   {
     LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->lazyAssign(m * svd.matrixV().adjoint());
+    m.col(Dim-1) *= x;
+    *rotation = m * svd.matrixV().adjoint();
   }
 }
 
@@ -1156,7 +1177,7 @@ struct transform_make_affine<AffineCompact>
 {
   template<typename MatrixType> EIGEN_DEVICE_FUNC static void run(MatrixType &) { }
 };
-    
+
 // selector needed to avoid taking the inverse of a 3x4 matrix
 template<typename TransformType, int Mode=TransformType::Mode>
 struct projective_transform_inverse
@@ -1297,8 +1318,8 @@ struct transform_construct_from_matrix<Other, AffineCompact,Options,Dim,HDim, HD
 template<int LhsMode,int RhsMode>
 struct transform_product_result
 {
-  enum 
-  { 
+  enum
+  {
     Mode =
       (LhsMode == (int)Projective    || RhsMode == (int)Projective    ) ? Projective :
       (LhsMode == (int)Affine        || RhsMode == (int)Affine        ) ? Affine :
@@ -1312,7 +1333,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols>
 {
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     return T.matrix() * other;
   }
@@ -1321,8 +1342,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols>
 template< typename TransformType, typename MatrixType, int RhsCols>
 struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols>
 {
-  enum { 
-    Dim = TransformType::Dim, 
+  enum {
+    Dim = TransformType::Dim,
     HDim = TransformType::HDim,
     OtherRows = MatrixType::RowsAtCompileTime,
     OtherCols = MatrixType::ColsAtCompileTime
@@ -1330,7 +1351,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols>
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     EIGEN_STATIC_ASSERT(OtherRows==HDim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
@@ -1339,7 +1360,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols>
     ResultType res(other.rows(),other.cols());
     TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() = T.affine() * other;
     res.row(OtherRows-1) = other.row(OtherRows-1);
-    
+
     return res;
   }
 };
@@ -1347,8 +1368,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols>
 template< typename TransformType, typename MatrixType, int RhsCols>
 struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols>
 {
-  enum { 
-    Dim = TransformType::Dim, 
+  enum {
+    Dim = TransformType::Dim,
     HDim = TransformType::HDim,
     OtherRows = MatrixType::RowsAtCompileTime,
     OtherCols = MatrixType::ColsAtCompileTime
@@ -1356,7 +1377,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols>
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
@@ -1381,7 +1402,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
   {
     EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
diff --git a/contrib/eigen/Eigen/src/Geometry/Translation.h b/contrib/eigen/Eigen/src/Geometry/Translation.h
index 0e99ce68e265348b280103e81ec8fe2847bc31dc..8c22901219002fc071c2dbc3d0c3788b904239b8 100644
--- a/contrib/eigen/Eigen/src/Geometry/Translation.h
+++ b/contrib/eigen/Eigen/src/Geometry/Translation.h
@@ -70,18 +70,18 @@ public:
   /** Constructs and initialize the translation transformation from a vector of translation coefficients */
   EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
 
-  /** \brief Retruns the x-translation by value. **/
+  /** \brief Returns the x-translation by value. **/
   EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); }
-  /** \brief Retruns the y-translation by value. **/
+  /** \brief Returns the y-translation by value. **/
   EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); }
-  /** \brief Retruns the z-translation by value. **/
+  /** \brief Returns the z-translation by value. **/
   EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); }
 
-  /** \brief Retruns the x-translation as a reference. **/
+  /** \brief Returns the x-translation as a reference. **/
   EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); }
-  /** \brief Retruns the y-translation as a reference. **/
+  /** \brief Returns the y-translation as a reference. **/
   EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); }
-  /** \brief Retruns the z-translation as a reference. **/
+  /** \brief Returns the z-translation as a reference. **/
   EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); }
 
   EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
diff --git a/contrib/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h b/contrib/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h
new file mode 100644
index 0000000000000000000000000000000000000000..9af6a9af720d8177759139b33f8f7db80e39f6fa
--- /dev/null
+++ b/contrib/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h
@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GEOMETRY_SIMD_H
+#define EIGEN_GEOMETRY_SIMD_H
+
+namespace Eigen { 
+
+namespace internal {
+
+template<class Derived, class OtherDerived>
+struct quat_product<Architecture::Target, Derived, OtherDerived, float>
+{
+  enum {
+    AAlignment = traits<Derived>::Alignment,
+    BAlignment = traits<OtherDerived>::Alignment,
+    ResAlignment = traits<Quaternion<float> >::Alignment
+  };
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
+  {
+    evaluator<typename Derived::Coefficients> ae(_a.coeffs());
+    evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
+    Quaternion<float> res;
+    const float neg_zero = numext::bit_cast<float>(0x80000000u);
+    const float arr[4] = {0.f, 0.f, 0.f, neg_zero};
+    const Packet4f mask = ploadu<Packet4f>(arr);
+    Packet4f a = ae.template packet<AAlignment,Packet4f>(0);
+    Packet4f b = be.template packet<BAlignment,Packet4f>(0);
+    Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+    Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
+    pstoret<float,Packet4f,ResAlignment>(
+              &res.x(),
+              padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)),
+                                    pmul(vec4f_swizzle1(a,2,0,1,0),
+                                               vec4f_swizzle1(b,1,2,0,0))),
+                         pxor(mask,padd(s1,s2))));
+    
+    return res;
+  }
+};
+
+template<class Derived>
+struct quat_conj<Architecture::Target, Derived, float>
+{
+  enum {
+    ResAlignment = traits<Quaternion<float> >::Alignment
+  };
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
+  {
+    evaluator<typename Derived::Coefficients> qe(q.coeffs());
+    Quaternion<float> res;
+    const float neg_zero = numext::bit_cast<float>(0x80000000u);
+    const float arr[4] = {neg_zero, neg_zero, neg_zero,0.f};
+    const Packet4f mask = ploadu<Packet4f>(arr);
+    pstoret<float,Packet4f,ResAlignment>(&res.x(), pxor(mask, qe.template packet<traits<Derived>::Alignment,Packet4f>(0)));
+    return res;
+  }
+};
+
+
+template<typename VectorLhs,typename VectorRhs>
+struct cross3_impl<Architecture::Target,VectorLhs,VectorRhs,float,true>
+{
+  enum {
+    ResAlignment = traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
+  };
+  static inline typename plain_matrix_type<VectorLhs>::type
+  run(const VectorLhs& lhs, const VectorRhs& rhs)
+  {
+    evaluator<VectorLhs> lhs_eval(lhs);
+    evaluator<VectorRhs> rhs_eval(rhs);
+    Packet4f a = lhs_eval.template packet<traits<VectorLhs>::Alignment,Packet4f>(0);
+    Packet4f b = rhs_eval.template packet<traits<VectorRhs>::Alignment,Packet4f>(0);
+    Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
+    Packet4f mul2 = pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
+    typename plain_matrix_type<VectorLhs>::type res;
+    pstoret<float,Packet4f,ResAlignment>(&res.x(),psub(mul1,mul2));
+    return res;
+  }
+};
+
+
+
+#if (defined EIGEN_VECTORIZE_SSE) || (EIGEN_ARCH_ARM64)
+
+template<class Derived, class OtherDerived>
+struct quat_product<Architecture::Target, Derived, OtherDerived, double>
+{
+  enum {
+    BAlignment = traits<OtherDerived>::Alignment,
+    ResAlignment = traits<Quaternion<double> >::Alignment
+  };
+
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
+  {
+  Quaternion<double> res;
+
+  evaluator<typename Derived::Coefficients> ae(_a.coeffs());
+  evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
+
+  const double* a = _a.coeffs().data();
+  Packet2d b_xy = be.template packet<BAlignment,Packet2d>(0);
+  Packet2d b_zw = be.template packet<BAlignment,Packet2d>(2);
+  Packet2d a_xx = pset1<Packet2d>(a[0]);
+  Packet2d a_yy = pset1<Packet2d>(a[1]);
+  Packet2d a_zz = pset1<Packet2d>(a[2]);
+  Packet2d a_ww = pset1<Packet2d>(a[3]);
+
+  // two temporaries:
+  Packet2d t1, t2;
+
+  /*
+   * t1 = ww*xy + yy*zw
+   * t2 = zz*xy - xx*zw
+   * res.xy = t1 +/- swap(t2)
+   */
+  t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
+  t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
+  pstoret<double,Packet2d,ResAlignment>(&res.x(), paddsub(t1, preverse(t2)));
+  
+  /*
+   * t1 = ww*zw - yy*xy
+   * t2 = zz*zw + xx*xy
+   * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
+   */
+  t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
+  t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
+  pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(paddsub(preverse(t1), t2)));
+
+  return res;
+}
+};
+
+template<class Derived>
+struct quat_conj<Architecture::Target, Derived, double>
+{
+  enum {
+    ResAlignment = traits<Quaternion<double> >::Alignment
+  };
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
+  {
+    evaluator<typename Derived::Coefficients> qe(q.coeffs());
+    Quaternion<double> res;
+    const double neg_zero = numext::bit_cast<double>(0x8000000000000000ull);
+    const double arr1[2] = {neg_zero, neg_zero};
+    const double arr2[2] = {neg_zero,  0.0};
+    const Packet2d mask0 = ploadu<Packet2d>(arr1);
+    const Packet2d mask2 = ploadu<Packet2d>(arr2);
+    pstoret<double,Packet2d,ResAlignment>(&res.x(), pxor(mask0, qe.template packet<traits<Derived>::Alignment,Packet2d>(0)));
+    pstoret<double,Packet2d,ResAlignment>(&res.z(), pxor(mask2, qe.template packet<traits<Derived>::Alignment,Packet2d>(2)));
+    return res;
+  }
+};
+
+#endif // end EIGEN_VECTORIZE_SSE_OR_EIGEN_ARCH_ARM64
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GEOMETRY_SIMD_H
diff --git a/contrib/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h b/contrib/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h
deleted file mode 100644
index f68cab5834fd0fd4b7beffc08e552fb6dfd29dd7..0000000000000000000000000000000000000000
--- a/contrib/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ /dev/null
@@ -1,161 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_GEOMETRY_SSE_H
-#define EIGEN_GEOMETRY_SSE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
-{
-  enum {
-    AAlignment = traits<Derived>::Alignment,
-    BAlignment = traits<OtherDerived>::Alignment,
-    ResAlignment = traits<Quaternion<float> >::Alignment
-  };
-  static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
-  {
-    Quaternion<float> res;
-    const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
-    __m128 a = _a.coeffs().template packet<AAlignment>(0);
-    __m128 b = _b.coeffs().template packet<BAlignment>(0);
-    __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
-    __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
-    pstoret<float,Packet4f,ResAlignment>(
-              &res.x(),
-              _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
-                                    _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
-                                               vec4f_swizzle1(b,1,2,0,0))),
-                         _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
-    
-    return res;
-  }
-};
-
-template<class Derived>
-struct quat_conj<Architecture::SSE, Derived, float>
-{
-  enum {
-    ResAlignment = traits<Quaternion<float> >::Alignment
-  };
-  static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
-  {
-    Quaternion<float> res;
-    const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
-    pstoret<float,Packet4f,ResAlignment>(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<traits<Derived>::Alignment>(0)));
-    return res;
-  }
-};
-
-
-template<typename VectorLhs,typename VectorRhs>
-struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
-{
-  enum {
-    ResAlignment = traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
-  };
-  static inline typename plain_matrix_type<VectorLhs>::type
-  run(const VectorLhs& lhs, const VectorRhs& rhs)
-  {
-    __m128 a = lhs.template packet<traits<VectorLhs>::Alignment>(0);
-    __m128 b = rhs.template packet<traits<VectorRhs>::Alignment>(0);
-    __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
-    __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
-    typename plain_matrix_type<VectorLhs>::type res;
-    pstoret<float,Packet4f,ResAlignment>(&res.x(),_mm_sub_ps(mul1,mul2));
-    return res;
-  }
-};
-
-
-
-
-template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double>
-{
-  enum {
-    BAlignment = traits<OtherDerived>::Alignment,
-    ResAlignment = traits<Quaternion<double> >::Alignment
-  };
-
-  static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
-  {
-  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
-
-  Quaternion<double> res;
-
-  const double* a = _a.coeffs().data();
-  Packet2d b_xy = _b.coeffs().template packet<BAlignment>(0);
-  Packet2d b_zw = _b.coeffs().template packet<BAlignment>(2);
-  Packet2d a_xx = pset1<Packet2d>(a[0]);
-  Packet2d a_yy = pset1<Packet2d>(a[1]);
-  Packet2d a_zz = pset1<Packet2d>(a[2]);
-  Packet2d a_ww = pset1<Packet2d>(a[3]);
-
-  // two temporaries:
-  Packet2d t1, t2;
-
-  /*
-   * t1 = ww*xy + yy*zw
-   * t2 = zz*xy - xx*zw
-   * res.xy = t1 +/- swap(t2)
-   */
-  t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
-  t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
-#ifdef EIGEN_VECTORIZE_SSE3
-  EIGEN_UNUSED_VARIABLE(mask)
-  pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
-#else
-  pstoret<double,Packet2d,ResAlignment>(&res.x(), padd(t1, pxor(mask,preverse(t2))));
-#endif
-  
-  /*
-   * t1 = ww*zw - yy*xy
-   * t2 = zz*zw + xx*xy
-   * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
-   */
-  t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
-  t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
-#ifdef EIGEN_VECTORIZE_SSE3
-  EIGEN_UNUSED_VARIABLE(mask)
-  pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
-#else
-  pstoret<double,Packet2d,ResAlignment>(&res.z(), psub(t1, pxor(mask,preverse(t2))));
-#endif
-
-  return res;
-}
-};
-
-template<class Derived>
-struct quat_conj<Architecture::SSE, Derived, double>
-{
-  enum {
-    ResAlignment = traits<Quaternion<double> >::Alignment
-  };
-  static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
-  {
-    Quaternion<double> res;
-    const __m128d mask0 = _mm_setr_pd(-0.,-0.);
-    const __m128d mask2 = _mm_setr_pd(-0.,0.);
-    pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<traits<Derived>::Alignment>(0)));
-    pstoret<double,Packet2d,ResAlignment>(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<traits<Derived>::Alignment>(2)));
-    return res;
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_GEOMETRY_SSE_H
diff --git a/contrib/eigen/Eigen/src/Householder/BlockHouseholder.h b/contrib/eigen/Eigen/src/Householder/BlockHouseholder.h
index 01a7ed1884d22d09af3d8668455f9d8bb2fb9ab0..39ce1c2a0efa24f1a86866dd699c4a7216728f47 100644
--- a/contrib/eigen/Eigen/src/Householder/BlockHouseholder.h
+++ b/contrib/eigen/Eigen/src/Householder/BlockHouseholder.h
@@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c
       triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
                                                         * vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
             
-      // FIXME add .noalias() once the triangular product can work inplace
-      triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      // FIXME use the following line with .noalias() once the triangular product can work inplace
+      // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      for(Index j=nbVecs-1; j>i; --j)
+      {
+        typename TriangularFactorType::Scalar z = triFactor(i,j);
+        triFactor(i,j) = z * triFactor(j,j);
+        if(nbVecs-j-1>0)
+          triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1);
+      }
       
     }
     triFactor(i,i) = hCoeffs(i);
diff --git a/contrib/eigen/Eigen/src/Householder/Householder.h b/contrib/eigen/Eigen/src/Householder/Householder.h
index 80de2c3052c08b3a333221f37adb43f67c5808a4..5bc037f00d18c992606fed9fef11f989fec373d5 100644
--- a/contrib/eigen/Eigen/src/Householder/Householder.h
+++ b/contrib/eigen/Eigen/src/Householder/Householder.h
@@ -39,6 +39,7 @@ template<int n> struct decrement_size
   *     MatrixBase::applyHouseholderOnTheRight()
   */
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
 {
   VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1);
@@ -62,6 +63,7 @@ void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
   */
 template<typename Derived>
 template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::makeHouseholder(
   EssentialPart& essential,
   Scalar& tau,
@@ -103,13 +105,14 @@ void MatrixBase<Derived>::makeHouseholder(
   * \param essential the essential part of the vector \c v
   * \param tau the scaling factor of the Householder transformation
   * \param workspace a pointer to working space with at least
-  *                  this->cols() * essential.size() entries
+  *                  this->cols() entries
   *
   * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
   *     MatrixBase::applyHouseholderOnTheRight()
   */
 template<typename Derived>
 template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   const EssentialPart& essential,
   const Scalar& tau,
@@ -140,13 +143,14 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   * \param essential the essential part of the vector \c v
   * \param tau the scaling factor of the Householder transformation
   * \param workspace a pointer to working space with at least
-  *                  this->cols() * essential.size() entries
+  *                  this->rows() entries
   *
   * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
   *     MatrixBase::applyHouseholderOnTheLeft()
   */
 template<typename Derived>
 template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::applyHouseholderOnTheRight(
   const EssentialPart& essential,
   const Scalar& tau,
@@ -160,10 +164,10 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
-    tmp.noalias() = right * essential.conjugate();
+    tmp.noalias() = right * essential;
     tmp += this->col(0);
     this->col(0) -= tau * tmp;
-    right.noalias() -= tau * tmp * essential.transpose();
+    right.noalias() -= tau * tmp * essential.adjoint();
   }
 }
 
diff --git a/contrib/eigen/Eigen/src/Householder/HouseholderSequence.h b/contrib/eigen/Eigen/src/Householder/HouseholderSequence.h
index 3ce0a693d9bf5a19394a92b584021dcda701ae97..022f6c3dba8e45e97f68753809eef1708daf5537 100644
--- a/contrib/eigen/Eigen/src/Householder/HouseholderSequence.h
+++ b/contrib/eigen/Eigen/src/Householder/HouseholderSequence.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_HOUSEHOLDER_SEQUENCE_H
 #define EIGEN_HOUSEHOLDER_SEQUENCE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Householder_Module
   * \householder_module
@@ -34,8 +34,8 @@ namespace Eigen {
   * form \f$ H = \prod_{i=0}^{n-1} H_i \f$ where the i-th Householder reflection is \f$ H_i = I - h_i v_i
   * v_i^* \f$. The i-th Householder coefficient \f$ h_i \f$ is a scalar and the i-th Householder vector \f$
   * v_i \f$ is a vector of the form
-  * \f[ 
-  * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. 
+  * \f[
+  * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ].
   * \f]
   * The last \f$ n-i \f$ entries of \f$ v_i \f$ are called the essential part of the Householder vector.
   *
@@ -87,7 +87,7 @@ struct hseq_side_dependent_impl
 {
   typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
-  static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
+  static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
   {
     Index start = k+1+h.m_shift;
     return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1);
@@ -120,7 +120,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
   : public EigenBase<HouseholderSequence<VectorsType,CoeffsType,Side> >
 {
     typedef typename internal::hseq_side_dependent_impl<VectorsType,CoeffsType,Side>::EssentialVectorType EssentialVectorType;
-  
+
   public:
     enum {
       RowsAtCompileTime = internal::traits<HouseholderSequence>::RowsAtCompileTime,
@@ -140,6 +140,28 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       Side
     > ConjugateReturnType;
 
+    typedef HouseholderSequence<
+      VectorsType,
+      typename internal::conditional<NumTraits<Scalar>::IsComplex,
+        typename internal::remove_all<typename CoeffsType::ConjugateReturnType>::type,
+        CoeffsType>::type,
+      Side
+    > AdjointReturnType;
+
+    typedef HouseholderSequence<
+      typename internal::conditional<NumTraits<Scalar>::IsComplex,
+        typename internal::remove_all<typename VectorsType::ConjugateReturnType>::type,
+        VectorsType>::type,
+      CoeffsType,
+      Side
+    > TransposeReturnType;
+
+    typedef HouseholderSequence<
+      typename internal::add_const<VectorsType>::type,
+      typename internal::add_const<CoeffsType>::type,
+      Side
+    > ConstHouseholderSequence;
+
     /** \brief Constructor.
       * \param[in]  v      %Matrix containing the essential parts of the Householder vectors
       * \param[in]  h      Vector containing the Householder coefficients
@@ -157,33 +179,37 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       *
       * \sa setLength(), setShift()
       */
+    EIGEN_DEVICE_FUNC
     HouseholderSequence(const VectorsType& v, const CoeffsType& h)
-      : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()),
+      : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()),
         m_shift(0)
     {
     }
 
     /** \brief Copy constructor. */
+    EIGEN_DEVICE_FUNC
     HouseholderSequence(const HouseholderSequence& other)
       : m_vectors(other.m_vectors),
         m_coeffs(other.m_coeffs),
-        m_trans(other.m_trans),
+        m_reverse(other.m_reverse),
         m_length(other.m_length),
         m_shift(other.m_shift)
     {
     }
 
     /** \brief Number of rows of transformation viewed as a matrix.
-      * \returns Number of rows 
+      * \returns Number of rows
       * \details This equals the dimension of the space that the transformation acts on.
       */
-    Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index rows() const EIGEN_NOEXCEPT { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
 
     /** \brief Number of columns of transformation viewed as a matrix.
       * \returns Number of columns
       * \details This equals the dimension of the space that the transformation acts on.
       */
-    Index cols() const { return rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    Index cols() const EIGEN_NOEXCEPT { return rows(); }
 
     /** \brief Essential part of a Householder vector.
       * \param[in]  k  Index of Householder reflection
@@ -191,14 +217,15 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       *
       * This function returns the essential part of the Householder vector \f$ v_i \f$. This is a vector of
       * length \f$ n-i \f$ containing the last \f$ n-i \f$ entries of the vector
-      * \f[ 
-      * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. 
+      * \f[
+      * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ].
       * \f]
       * The index \f$ i \f$ equals \p k + shift(), corresponding to the k-th column of the matrix \p v
       * passed to the constructor.
       *
       * \sa setShift(), shift()
       */
+    EIGEN_DEVICE_FUNC
     const EssentialVectorType essentialVector(Index k) const
     {
       eigen_assert(k >= 0 && k < m_length);
@@ -206,31 +233,51 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     }
 
     /** \brief %Transpose of the Householder sequence. */
-    HouseholderSequence transpose() const
+    TransposeReturnType transpose() const
     {
-      return HouseholderSequence(*this).setTrans(!m_trans);
+      return TransposeReturnType(m_vectors.conjugate(), m_coeffs)
+              .setReverseFlag(!m_reverse)
+              .setLength(m_length)
+              .setShift(m_shift);
     }
 
     /** \brief Complex conjugate of the Householder sequence. */
     ConjugateReturnType conjugate() const
     {
       return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate())
-             .setTrans(m_trans)
+             .setReverseFlag(m_reverse)
              .setLength(m_length)
              .setShift(m_shift);
     }
 
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type ReturnType;
+      return ReturnType(m_vectors.template conjugateIf<Cond>(), m_coeffs.template conjugateIf<Cond>());
+    }
+
     /** \brief Adjoint (conjugate transpose) of the Householder sequence. */
-    ConjugateReturnType adjoint() const
+    AdjointReturnType adjoint() const
     {
-      return conjugate().setTrans(!m_trans);
+      return AdjointReturnType(m_vectors, m_coeffs.conjugate())
+              .setReverseFlag(!m_reverse)
+              .setLength(m_length)
+              .setShift(m_shift);
     }
 
     /** \brief Inverse of the Householder sequence (equals the adjoint). */
-    ConjugateReturnType inverse() const { return adjoint(); }
+    AdjointReturnType inverse() const { return adjoint(); }
 
     /** \internal */
-    template<typename DestType> inline void evalTo(DestType& dst) const
+    template<typename DestType>
+    inline EIGEN_DEVICE_FUNC
+    void evalTo(DestType& dst) const
     {
       Matrix<Scalar, DestType::RowsAtCompileTime, 1,
              AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows());
@@ -239,6 +286,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
 
     /** \internal */
     template<typename Dest, typename Workspace>
+    EIGEN_DEVICE_FUNC
     void evalTo(Dest& dst, Workspace& workspace) const
     {
       workspace.resize(rows());
@@ -251,7 +299,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
         for(Index k = vecs-1; k >= 0; --k)
         {
           Index cornerSize = rows() - k - m_shift;
-          if(m_trans)
+          if(m_reverse)
             dst.bottomRightCorner(cornerSize, cornerSize)
                .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
           else
@@ -265,18 +313,26 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
         for(Index k = 0; k<cols()-vecs ; ++k)
           dst.col(k).tail(rows()-k-1).setZero();
       }
+      else if(m_length>BlockSize)
+      {
+        dst.setIdentity(rows(), rows());
+        if(m_reverse)
+          applyThisOnTheLeft(dst,workspace,true);
+        else
+          applyThisOnTheLeft(dst,workspace,true);
+      }
       else
       {
         dst.setIdentity(rows(), rows());
         for(Index k = vecs-1; k >= 0; --k)
         {
           Index cornerSize = rows() - k - m_shift;
-          if(m_trans)
+          if(m_reverse)
             dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
           else
             dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
         }
       }
     }
@@ -295,42 +351,52 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       workspace.resize(dst.rows());
       for(Index k = 0; k < m_length; ++k)
       {
-        Index actual_k = m_trans ? m_length-k-1 : k;
+        Index actual_k = m_reverse ? m_length-k-1 : k;
         dst.rightCols(rows()-m_shift-actual_k)
            .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
       }
     }
 
     /** \internal */
-    template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+    template<typename Dest> inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const
     {
       Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace;
-      applyThisOnTheLeft(dst, workspace);
+      applyThisOnTheLeft(dst, workspace, inputIsIdentity);
     }
 
     /** \internal */
     template<typename Dest, typename Workspace>
-    inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
+    inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const
     {
-      const Index BlockSize = 48;
+      if(inputIsIdentity && m_reverse)
+        inputIsIdentity = false;
       // if the entries are large enough, then apply the reflectors by block
       if(m_length>=BlockSize && dst.cols()>1)
       {
-        for(Index i = 0; i < m_length; i+=BlockSize)
+        // Make sure we have at least 2 useful blocks, otherwise it is point-less:
+        Index blockSize = m_length<Index(2*BlockSize) ? (m_length+1)/2 : Index(BlockSize);
+        for(Index i = 0; i < m_length; i+=blockSize)
         {
-          Index end = m_trans ? (std::min)(m_length,i+BlockSize) : m_length-i;
-          Index k = m_trans ? i : (std::max)(Index(0),end-BlockSize);
+          Index end = m_reverse ? (std::min)(m_length,i+blockSize) : m_length-i;
+          Index k = m_reverse ? i : (std::max)(Index(0),end-blockSize);
           Index bs = end-k;
           Index start = k + m_shift;
-          
+
           typedef Block<typename internal::remove_all<VectorsType>::type,Dynamic,Dynamic> SubVectorsType;
           SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side==OnTheRight ? k : start,
                                                                    Side==OnTheRight ? start : k,
                                                                    Side==OnTheRight ? bs : m_vectors.rows()-start,
                                                                    Side==OnTheRight ? m_vectors.cols()-start : bs);
           typename internal::conditional<Side==OnTheRight, Transpose<SubVectorsType>, SubVectorsType&>::type sub_vecs(sub_vecs1);
-          Block<Dest,Dynamic,Dynamic> sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols());
-          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans);
+
+          Index dstStart = dst.rows()-rows()+m_shift+k;
+          Index dstRows  = rows()-m_shift-k;
+          Block<Dest,Dynamic,Dynamic> sub_dst(dst,
+                                              dstStart,
+                                              inputIsIdentity ? dstStart : 0,
+                                              dstRows,
+                                              inputIsIdentity ? dstRows : dst.cols());
+          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse);
         }
       }
       else
@@ -338,8 +404,9 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
         workspace.resize(dst.cols());
         for(Index k = 0; k < m_length; ++k)
         {
-          Index actual_k = m_trans ? k : m_length-k-1;
-          dst.bottomRows(rows()-m_shift-actual_k)
+          Index actual_k = m_reverse ? k : m_length-k-1;
+          Index dstStart = rows()-m_shift-actual_k;
+          dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols())
             .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
         }
       }
@@ -357,7 +424,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     {
       typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type
         res(other.template cast<typename internal::matrix_type_times_scalar_type<Scalar,OtherDerived>::ResultScalar>());
-      applyThisOnTheLeft(res);
+      applyThisOnTheLeft(res, internal::is_identity<OtherDerived>::value && res.rows()==res.cols());
       return res;
     }
 
@@ -372,6 +439,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       *
       * \sa length()
       */
+    EIGEN_DEVICE_FUNC
     HouseholderSequence& setLength(Index length)
     {
       m_length = length;
@@ -389,13 +457,17 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       *
       * \sa shift()
       */
+    EIGEN_DEVICE_FUNC
     HouseholderSequence& setShift(Index shift)
     {
       m_shift = shift;
       return *this;
     }
 
+    EIGEN_DEVICE_FUNC
     Index length() const { return m_length; }  /**< \brief Returns the length of the Householder sequence. */
+
+    EIGEN_DEVICE_FUNC
     Index shift() const { return m_shift; }    /**< \brief Returns the shift of the Householder sequence. */
 
     /* Necessary for .adjoint() and .conjugate() */
@@ -403,27 +475,30 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
 
   protected:
 
-    /** \brief Sets the transpose flag.
-      * \param [in]  trans  New value of the transpose flag.
+    /** \internal
+      * \brief Sets the reverse flag.
+      * \param [in]  reverse  New value of the reverse flag.
       *
-      * By default, the transpose flag is not set. If the transpose flag is set, then this object represents 
-      * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+      * By default, the reverse flag is not set. If the reverse flag is set, then this object represents
+      * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+      * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$.
       *
-      * \sa trans()
+      * \sa reverseFlag(), transpose(), adjoint()
       */
-    HouseholderSequence& setTrans(bool trans)
+    HouseholderSequence& setReverseFlag(bool reverse)
     {
-      m_trans = trans;
+      m_reverse = reverse;
       return *this;
     }
 
-    bool trans() const { return m_trans; }     /**< \brief Returns the transpose flag. */
+    bool reverseFlag() const { return m_reverse; }     /**< \internal \brief Returns the reverse flag. */
 
     typename VectorsType::Nested m_vectors;
     typename CoeffsType::Nested m_coeffs;
-    bool m_trans;
+    bool m_reverse;
     Index m_length;
     Index m_shift;
+    enum { BlockSize = 48 };
 };
 
 /** \brief Computes the product of a matrix with a Householder sequence.
@@ -444,7 +519,7 @@ typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,Ot
 }
 
 /** \ingroup Householder_Module \householder_module
-  * \brief Convenience function for constructing a Householder sequence. 
+  * \brief Convenience function for constructing a Householder sequence.
   * \returns A HouseholderSequence constructed from the specified arguments.
   */
 template<typename VectorsType, typename CoeffsType>
@@ -454,7 +529,7 @@ HouseholderSequence<VectorsType,CoeffsType> householderSequence(const VectorsTyp
 }
 
 /** \ingroup Householder_Module \householder_module
-  * \brief Convenience function for constructing a Householder sequence. 
+  * \brief Convenience function for constructing a Householder sequence.
   * \returns A HouseholderSequence constructed from the specified arguments.
   * \details This function differs from householderSequence() in that the template argument \p OnTheSide of
   * the constructed HouseholderSequence is set to OnTheRight, instead of the default OnTheLeft.
diff --git a/contrib/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/contrib/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index f66c846ef7920b771fd8360aaecb40d987d14a51..a117fc1551920de7e36ea6e33cbf959a0e62b4ab 100644
--- a/contrib/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/contrib/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_BASIC_PRECONDITIONERS_H
 #define EIGEN_BASIC_PRECONDITIONERS_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup IterativeLinearSolvers_Module
   * \brief A preconditioner based on the digonal entries
@@ -52,15 +52,15 @@ class DiagonalPreconditioner
       compute(mat);
     }
 
-    Index rows() const { return m_invdiag.size(); }
-    Index cols() const { return m_invdiag.size(); }
-    
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); }
+
     template<typename MatType>
     DiagonalPreconditioner& analyzePattern(const MatType& )
     {
       return *this;
     }
-    
+
     template<typename MatType>
     DiagonalPreconditioner& factorize(const MatType& mat)
     {
@@ -77,7 +77,7 @@ class DiagonalPreconditioner
       m_isInitialized = true;
       return *this;
     }
-    
+
     template<typename MatType>
     DiagonalPreconditioner& compute(const MatType& mat)
     {
@@ -99,7 +99,7 @@ class DiagonalPreconditioner
                 && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
       return Solve<DiagonalPreconditioner, Rhs>(*this, b.derived());
     }
-    
+
     ComputationInfo info() { return Success; }
 
   protected:
@@ -121,7 +121,7 @@ class DiagonalPreconditioner
   * \implsparsesolverconcept
   *
   * The diagonal entries are pre-inverted and stored into a dense vector.
-  * 
+  *
   * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner
   */
 template <typename _Scalar>
@@ -146,7 +146,7 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar>
     {
       return *this;
     }
-    
+
     template<typename MatType>
     LeastSquareDiagonalPreconditioner& factorize(const MatType& mat)
     {
@@ -178,13 +178,13 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar>
       Base::m_isInitialized = true;
       return *this;
     }
-    
+
     template<typename MatType>
     LeastSquareDiagonalPreconditioner& compute(const MatType& mat)
     {
       return factorize(mat);
     }
-    
+
     ComputationInfo info() { return Success; }
 
   protected:
@@ -205,19 +205,19 @@ class IdentityPreconditioner
 
     template<typename MatrixType>
     explicit IdentityPreconditioner(const MatrixType& ) {}
-    
+
     template<typename MatrixType>
     IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; }
-    
+
     template<typename MatrixType>
     IdentityPreconditioner& factorize(const MatrixType& ) { return *this; }
 
     template<typename MatrixType>
     IdentityPreconditioner& compute(const MatrixType& ) { return *this; }
-    
+
     template<typename Rhs>
     inline const Rhs& solve(const Rhs& b) const { return b; }
-    
+
     ComputationInfo info() { return Success; }
 };
 
diff --git a/contrib/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/contrib/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index 454f468149709d266080b80e271e3e3b481febca..153acef65ba921c1ecec39ff670ba2c32b1dfef6 100644
--- a/contrib/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/contrib/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -191,32 +191,16 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
   {    
-    bool failed = false;
-    for(Index j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-      
-      typename Dest::ColXpr xj(x,j);
-      if(!internal::bicgstab(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
-        failed = true;
-    }
-    m_info = failed ? NumericalIssue
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    
+    bool ret = internal::bicgstab(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error);
+
+    m_info = (!ret) ? NumericalIssue
            : m_error <= Base::m_tolerance ? Success
            : NoConvergence;
-    m_isInitialized = true;
-  }
-
-  /** \internal */
-  using Base::_solve_impl;
-  template<typename Rhs,typename Dest>
-  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
-  {
-    x.resize(this->rows(),b.cols());
-    x.setZero();
-    _solve_with_guess_impl(b,x);
   }
 
 protected:
diff --git a/contrib/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/contrib/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index f7ce471349a2b39c57d2f0ad8f425c512f99cc79..5d8c6b4339ee592b24f6092cfa67a53c51a2a0da 100644
--- a/contrib/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/contrib/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -51,7 +51,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
     return;
   }
   const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero);
+  RealScalar threshold = numext::maxi(RealScalar(tol*tol*rhsNorm2),considerAsZero);
   RealScalar residualNorm2 = residual.squaredNorm();
   if (residualNorm2 < threshold)
   {
@@ -195,7 +195,7 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
   {
     typedef typename Base::MatrixWrapper MatrixWrapper;
     typedef typename Base::ActualMatrixType ActualMatrixType;
@@ -211,31 +211,14 @@ public:
                                            RowMajorWrapper,
                                            typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type
                                           >::type SelfAdjointWrapper;
+
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
 
-    for(Index j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-
-      typename Dest::ColXpr xj(x,j);
-      RowMajorWrapper row_mat(matrix());
-      internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
-    }
-
-    m_isInitialized = true;
+    RowMajorWrapper row_mat(matrix());
+    internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b, x, Base::m_preconditioner, m_iterations, m_error);
     m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
   }
-  
-  /** \internal */
-  using Base::_solve_impl;
-  template<typename Rhs,typename Dest>
-  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
-  {
-    x.setZero();
-    _solve_with_guess_impl(b.derived(),x);
-  }
 
 protected:
 
diff --git a/contrib/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/contrib/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index e45c272b4c4d7896b2319c1c3bfa47fc5ec01f80..7803fd8170f2f3772fa57abc22726a236026d9ba 100644
--- a/contrib/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/contrib/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -14,8 +14,8 @@
 #include <vector>
 #include <list>
 
-namespace Eigen {  
-/** 
+namespace Eigen {
+/**
   * \brief Modified Incomplete Cholesky with dual threshold
   *
   * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
@@ -41,28 +41,22 @@ namespace Eigen {
   * the info() method, then you can either increase the initial shift, or better use another preconditioning technique.
   *
   */
-template <typename Scalar, int _UpLo = Lower, typename _OrderingType =
-#ifndef EIGEN_MPL2_ONLY
-AMDOrdering<int>
-#else
-NaturalOrdering<int>
-#endif
->
+template <typename Scalar, int _UpLo = Lower, typename _OrderingType = AMDOrdering<int> >
 class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> >
 {
   protected:
     typedef SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> > Base;
     using Base::m_isInitialized;
   public:
-    typedef typename NumTraits<Scalar>::Real RealScalar; 
+    typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef _OrderingType OrderingType;
     typedef typename OrderingType::PermutationType PermutationType;
-    typedef typename PermutationType::StorageIndex StorageIndex; 
+    typedef typename PermutationType::StorageIndex StorageIndex;
     typedef SparseMatrix<Scalar,ColMajor,StorageIndex> FactorType;
     typedef Matrix<Scalar,Dynamic,1> VectorSx;
     typedef Matrix<RealScalar,Dynamic,1> VectorRx;
     typedef Matrix<StorageIndex,Dynamic, 1> VectorIx;
-    typedef std::vector<std::list<StorageIndex> > VectorList; 
+    typedef std::vector<std::list<StorageIndex> > VectorList;
     enum { UpLo = _UpLo };
     enum {
       ColsAtCompileTime = Dynamic,
@@ -76,22 +70,22 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
       *
       * \sa IncompleteCholesky(const MatrixType&)
       */
-    IncompleteCholesky() : m_initialShift(1e-3),m_factorizationIsOk(false) {}
-    
+    IncompleteCholesky() : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) {}
+
     /** Constructor computing the incomplete factorization for the given matrix \a matrix.
       */
     template<typename MatrixType>
-    IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_factorizationIsOk(false)
+    IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false)
     {
       compute(matrix);
     }
-    
+
     /** \returns number of rows of the factored matrix */
-    Index rows() const { return m_L.rows(); }
-    
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); }
+
     /** \returns number of columns of the factored matrix */
-    Index cols() const { return m_L.cols(); }
-    
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); }
+
 
     /** \brief Reports whether previous computation was successful.
       *
@@ -106,19 +100,19 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
       eigen_assert(m_isInitialized && "IncompleteCholesky is not initialized.");
       return m_info;
     }
-    
+
     /** \brief Set the initial shift parameter \f$ \sigma \f$.
       */
     void setInitialShift(RealScalar shift) { m_initialShift = shift; }
-    
+
     /** \brief Computes the fill reducing permutation vector using the sparsity pattern of \a mat
       */
     template<typename MatrixType>
     void analyzePattern(const MatrixType& mat)
     {
-      OrderingType ord; 
+      OrderingType ord;
       PermutationType pinv;
-      ord(mat.template selfadjointView<UpLo>(), pinv); 
+      ord(mat.template selfadjointView<UpLo>(), pinv);
       if(pinv.size()>0) m_perm = pinv.inverse();
       else              m_perm.resize(0);
       m_L.resize(mat.rows(), mat.cols());
@@ -126,7 +120,7 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
       m_isInitialized = true;
       m_info = Success;
     }
-    
+
     /** \brief Performs the numerical factorization of the input matrix \a mat
       *
       * The method analyzePattern() or compute() must have been called beforehand
@@ -136,7 +130,7 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
       */
     template<typename MatrixType>
     void factorize(const MatrixType& mat);
-    
+
     /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat
       *
       * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods.
@@ -149,7 +143,7 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
       analyzePattern(mat);
       factorize(mat);
     }
-    
+
     // internal
     template<typename Rhs, typename Dest>
     void _solve_impl(const Rhs& b, Dest& x) const
@@ -176,16 +170,16 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
 
   protected:
     FactorType m_L;              // The lower part stored in CSC
-    VectorRx m_scale;            // The vector for scaling the matrix 
+    VectorRx m_scale;            // The vector for scaling the matrix
     RealScalar m_initialShift;   // The initial shift parameter
-    bool m_analysisIsOk; 
-    bool m_factorizationIsOk; 
+    bool m_analysisIsOk;
+    bool m_factorizationIsOk;
     ComputationInfo m_info;
-    PermutationType m_perm; 
+    PermutationType m_perm;
 
   private:
-    inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); 
-}; 
+    inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol);
+};
 
 // Based on the following paper:
 //   C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
@@ -196,10 +190,10 @@ template<typename _MatrixType>
 void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
 {
   using std::sqrt;
-  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
-    
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
+
   // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added
-  
+
   // Apply the fill-reducing permutation computed in analyzePattern()
   if (m_perm.rows() == mat.rows() ) // To detect the null permutation
   {
@@ -212,8 +206,8 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
   {
     m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>();
   }
-  
-  Index n = m_L.cols(); 
+
+  Index n = m_L.cols();
   Index nnz = m_L.nonZeros();
   Map<VectorSx> vals(m_L.valuePtr(), nnz);         //values
   Map<VectorIx> rowIdx(m_L.innerIndexPtr(), nnz);  //Row indices
@@ -225,9 +219,9 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
   VectorIx col_pattern(n);
   col_pattern.fill(-1);
   StorageIndex col_nnz;
-  
-  
-  // Computes the scaling factors 
+
+
+  // Computes the scaling factors
   m_scale.resize(n);
   m_scale.setZero();
   for (Index j = 0; j < n; j++)
@@ -237,7 +231,7 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
       if(rowIdx[k]!=j)
         m_scale(rowIdx[k]) += numext::abs2(vals(k));
     }
-  
+
   m_scale = m_scale.cwiseSqrt().cwiseSqrt();
 
   for (Index j = 0; j < n; ++j)
@@ -247,8 +241,8 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
       m_scale(j) = 1;
 
   // TODO disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
-  
-  // Scale and compute the shift for the matrix 
+
+  // Scale and compute the shift for the matrix
   RealScalar mindiag = NumTraits<RealScalar>::highest();
   for (Index j = 0; j < n; j++)
   {
@@ -259,7 +253,7 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
   }
 
   FactorType L_save = m_L;
-  
+
   RealScalar shift = 0;
   if(mindiag <= RealScalar(0.))
     shift = m_initialShift - mindiag;
@@ -381,7 +375,7 @@ inline void IncompleteCholesky<Scalar,_UpLo, OrderingType>::updateList(Ref<const
   if (jk < colPtr(col+1) )
   {
     Index p = colPtr(col+1) - jk;
-    Index minpos; 
+    Index minpos;
     rowIdx.segment(jk,p).minCoeff(&minpos);
     minpos += jk;
     if (rowIdx(minpos) != rowIdx(jk))
@@ -395,6 +389,6 @@ inline void IncompleteCholesky<Scalar,_UpLo, OrderingType>::updateList(Ref<const
   }
 }
 
-} // end namespace Eigen 
+} // end namespace Eigen
 
 #endif
diff --git a/contrib/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/contrib/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 338e6f10a871299f17fa5e41028f2c6a043d0283..cdcf709eb63e0c5785d314db3855c813b8f4e3f3 100644
--- a/contrib/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/contrib/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -12,19 +12,19 @@
 #define EIGEN_INCOMPLETE_LUT_H
 
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-    
+
 /** \internal
-  * Compute a quick-sort split of a vector 
+  * Compute a quick-sort split of a vector
   * On output, the vector row is permuted such that its elements satisfy
   * abs(row(i)) >= abs(row(ncut)) if i<ncut
-  * abs(row(i)) <= abs(row(ncut)) if i>ncut 
+  * abs(row(i)) <= abs(row(ncut)) if i>ncut
   * \param row The vector of values
   * \param ind The array of index for the elements in @p row
   * \param ncut  The number of largest elements to keep
-  **/ 
+  **/
 template <typename VectorV, typename VectorI>
 Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
 {
@@ -34,15 +34,15 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
   Index mid;
   Index n = row.size(); /* length of the vector */
   Index first, last ;
-  
+
   ncut--; /* to fit the zero-based indices */
-  first = 0; 
-  last = n-1; 
+  first = 0;
+  last = n-1;
   if (ncut < first || ncut > last ) return 0;
-  
+
   do {
-    mid = first; 
-    RealScalar abskey = abs(row(mid)); 
+    mid = first;
+    RealScalar abskey = abs(row(mid));
     for (Index j = first + 1; j <= last; j++) {
       if ( abs(row(j)) > abskey) {
         ++mid;
@@ -53,12 +53,12 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
     /* Interchange for the pivot element */
     swap(row(mid), row(first));
     swap(ind(mid), ind(first));
-    
+
     if (mid > ncut) last = mid - 1;
-    else if (mid < ncut ) first = mid + 1; 
+    else if (mid < ncut ) first = mid + 1;
   } while (mid != ncut );
-  
-  return 0; /* mid is equal to ncut */ 
+
+  return 0; /* mid is equal to ncut */
 }
 
 }// end namespace internal
@@ -71,23 +71,23 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
   *
   * During the numerical factorization, two dropping rules are used :
   *  1) any element whose magnitude is less than some tolerance is dropped.
-  *    This tolerance is obtained by multiplying the input tolerance @p droptol 
+  *    This tolerance is obtained by multiplying the input tolerance @p droptol
   *    by the average magnitude of all the original elements in the current row.
-  *  2) After the elimination of the row, only the @p fill largest elements in 
-  *    the L part and the @p fill largest elements in the U part are kept 
-  *    (in addition to the diagonal element ). Note that @p fill is computed from 
-  *    the input parameter @p fillfactor which is used the ratio to control the fill_in 
+  *  2) After the elimination of the row, only the @p fill largest elements in
+  *    the L part and the @p fill largest elements in the U part are kept
+  *    (in addition to the diagonal element ). Note that @p fill is computed from
+  *    the input parameter @p fillfactor which is used the ratio to control the fill_in
   *    relatively to the initial number of nonzero elements.
-  * 
+  *
   * The two extreme cases are when @p droptol=0 (to keep all the @p fill*2 largest elements)
-  * and when @p fill=n/2 with @p droptol being different to zero. 
-  * 
-  * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, 
+  * and when @p fill=n/2 with @p droptol being different to zero.
+  *
+  * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization,
   *              Numerical Linear Algebra with Applications, 1(4), pp 387-402, 1994.
-  * 
+  *
   * NOTE : The following implementation is derived from the ILUT implementation
-  * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota 
-  *  released under the terms of the GNU LGPL: 
+  * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota
+  *  released under the terms of the GNU LGPL:
   *    http://www-users.cs.umn.edu/~saad/software/SPARSKIT/README
   * However, Yousef Saad gave us permission to relicense his ILUT code to MPL2.
   * See the Eigen mailing list archive, thread: ILUT, date: July 8, 2012:
@@ -115,28 +115,28 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageInd
     };
 
   public:
-    
+
     IncompleteLUT()
       : m_droptol(NumTraits<Scalar>::dummy_precision()), m_fillfactor(10),
         m_analysisIsOk(false), m_factorizationIsOk(false)
     {}
-    
+
     template<typename MatrixType>
     explicit IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
       : m_droptol(droptol),m_fillfactor(fillfactor),
         m_analysisIsOk(false),m_factorizationIsOk(false)
     {
       eigen_assert(fillfactor != 0);
-      compute(mat); 
+      compute(mat);
     }
-    
-    Index rows() const { return m_lu.rows(); }
-    
-    Index cols() const { return m_lu.cols(); }
+
+    EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+
+    EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -144,36 +144,36 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageInd
       eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
       return m_info;
     }
-    
+
     template<typename MatrixType>
     void analyzePattern(const MatrixType& amat);
-    
+
     template<typename MatrixType>
     void factorize(const MatrixType& amat);
-    
+
     /**
       * Compute an incomplete LU factorization with dual threshold on the matrix mat
       * No pivoting is done in this version
-      * 
+      *
       **/
     template<typename MatrixType>
     IncompleteLUT& compute(const MatrixType& amat)
     {
-      analyzePattern(amat); 
+      analyzePattern(amat);
       factorize(amat);
       return *this;
     }
 
-    void setDroptol(const RealScalar& droptol); 
-    void setFillfactor(int fillfactor); 
-    
+    void setDroptol(const RealScalar& droptol);
+    void setFillfactor(int fillfactor);
+
     template<typename Rhs, typename Dest>
     void _solve_impl(const Rhs& b, Dest& x) const
     {
       x = m_Pinv * b;
       x = m_lu.template triangularView<UnitLower>().solve(x);
       x = m_lu.template triangularView<Upper>().solve(x);
-      x = m_P * x; 
+      x = m_P * x;
     }
 
 protected:
@@ -200,22 +200,22 @@ protected:
 
 /**
  * Set control parameter droptol
- *  \param droptol   Drop any element whose magnitude is less than this tolerance 
- **/ 
+ *  \param droptol   Drop any element whose magnitude is less than this tolerance
+ **/
 template<typename Scalar, typename StorageIndex>
 void IncompleteLUT<Scalar,StorageIndex>::setDroptol(const RealScalar& droptol)
 {
-  this->m_droptol = droptol;   
+  this->m_droptol = droptol;
 }
 
 /**
  * Set control parameter fillfactor
- * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row. 
- **/ 
+ * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row.
+ **/
 template<typename Scalar, typename StorageIndex>
 void IncompleteLUT<Scalar,StorageIndex>::setFillfactor(int fillfactor)
 {
-  this->m_fillfactor = fillfactor;   
+  this->m_fillfactor = fillfactor;
 }
 
 template <typename Scalar, typename StorageIndex>
@@ -225,24 +225,15 @@ void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
   // Compute the Fill-reducing permutation
   // Since ILUT does not perform any numerical pivoting,
   // it is highly preferable to keep the diagonal through symmetric permutations.
-#ifndef EIGEN_MPL2_ONLY
   // To this end, let's symmetrize the pattern and perform AMD on it.
   SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
   SparseMatrix<Scalar,ColMajor, StorageIndex> mat2 = amat.transpose();
   // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
-  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered...
+  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred...
   SparseMatrix<Scalar,ColMajor, StorageIndex> AtA = mat2 + mat1;
   AMDOrdering<StorageIndex> ordering;
   ordering(AtA,m_P);
   m_Pinv  = m_P.inverse(); // cache the inverse permutation
-#else
-  // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine.
-  SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
-  COLAMDOrdering<StorageIndex> ordering;
-  ordering(mat1,m_Pinv);
-  m_P = m_Pinv.inverse();
-#endif
-
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
   m_isInitialized = true;
diff --git a/contrib/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/contrib/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 7c2326eb7fd98246e5589d16a541169b9195c917..28a0c5109e9fef44e1db80d742988f015d00b604 100644
--- a/contrib/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/contrib/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_ITERATIVE_SOLVER_BASE_H
 #define EIGEN_ITERATIVE_SOLVER_BASE_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -145,7 +145,7 @@ class IterativeSolverBase : public SparseSolverBase<Derived>
 protected:
   typedef SparseSolverBase<Derived> Base;
   using Base::m_isInitialized;
-  
+
 public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename internal::traits<Derived>::Preconditioner Preconditioner;
@@ -169,10 +169,10 @@ public:
   }
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
+    *
     * This constructor is a shortcut for the default constructor followed
     * by a call to compute().
-    * 
+    *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
@@ -187,7 +187,7 @@ public:
   }
 
   ~IterativeSolverBase() {}
-  
+
   /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems.
     *
     * Currently, this function mostly calls analyzePattern on the preconditioner. In the future
@@ -203,7 +203,7 @@ public:
     m_info = m_preconditioner.info();
     return derived();
   }
-  
+
   /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems.
     *
     * Currently, this function mostly calls factorize on the preconditioner.
@@ -216,7 +216,7 @@ public:
   template<typename MatrixDerived>
   Derived& factorize(const EigenBase<MatrixDerived>& A)
   {
-    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); 
+    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
     grab(A.derived());
     m_preconditioner.factorize(matrix());
     m_factorizationIsOk = true;
@@ -247,16 +247,16 @@ public:
   }
 
   /** \internal */
-  Index rows() const { return matrix().rows(); }
+  EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); }
 
   /** \internal */
-  Index cols() const { return matrix().cols(); }
+  EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); }
 
   /** \returns the tolerance threshold used by the stopping criteria.
     * \sa setTolerance()
     */
   RealScalar tolerance() const { return m_tolerance; }
-  
+
   /** Sets the tolerance threshold used by the stopping criteria.
     *
     * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|.
@@ -270,19 +270,19 @@ public:
 
   /** \returns a read-write reference to the preconditioner for custom configuration. */
   Preconditioner& preconditioner() { return m_preconditioner; }
-  
+
   /** \returns a read-only reference to the preconditioner. */
   const Preconditioner& preconditioner() const { return m_preconditioner; }
 
   /** \returns the max number of iterations.
-    * It is either the value setted by setMaxIterations or, by default,
+    * It is either the value set by setMaxIterations or, by default,
     * twice the number of columns of the matrix.
     */
   Index maxIterations() const
   {
     return (m_maxIterations<0) ? 2*matrix().cols() : m_maxIterations;
   }
-  
+
   /** Sets the max number of iterations.
     * Default is twice the number of columns of the matrix.
     */
@@ -328,13 +328,13 @@ public:
     eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
     return m_info;
   }
-  
+
   /** \internal */
   template<typename Rhs, typename DestDerived>
-  void _solve_impl(const Rhs& b, SparseMatrixBase<DestDerived> &aDest) const
+  void _solve_with_guess_impl(const Rhs& b, SparseMatrixBase<DestDerived> &aDest) const
   {
     eigen_assert(rows()==b.rows());
-    
+
     Index rhsCols = b.cols();
     Index size = b.rows();
     DestDerived& dest(aDest.derived());
@@ -344,15 +344,65 @@ public:
     // We do not directly fill dest because sparse expressions have to be free of aliasing issue.
     // For non square least-square problems, b and dest might not have the same size whereas they might alias each-other.
     typename DestDerived::PlainObject tmp(cols(),rhsCols);
+    ComputationInfo global_info = Success;
     for(Index k=0; k<rhsCols; ++k)
     {
       tb = b.col(k);
-      tx = derived().solve(tb);
+      tx = dest.col(k);
+      derived()._solve_vector_with_guess_impl(tb,tx);
       tmp.col(k) = tx.sparseView(0);
+
+      // The call to _solve_vector_with_guess_impl updates m_info, so if it failed for a previous column
+      // we need to restore it to the worst value.
+      if(m_info==NumericalIssue)
+        global_info = NumericalIssue;
+      else if(m_info==NoConvergence)
+        global_info = NoConvergence;
     }
+    m_info = global_info;
     dest.swap(tmp);
   }
 
+  template<typename Rhs, typename DestDerived>
+  typename internal::enable_if<Rhs::ColsAtCompileTime!=1 && DestDerived::ColsAtCompileTime!=1>::type
+  _solve_with_guess_impl(const Rhs& b, MatrixBase<DestDerived> &aDest) const
+  {
+    eigen_assert(rows()==b.rows());
+
+    Index rhsCols = b.cols();
+    DestDerived& dest(aDest.derived());
+    ComputationInfo global_info = Success;
+    for(Index k=0; k<rhsCols; ++k)
+    {
+      typename DestDerived::ColXpr xk(dest,k);
+      typename Rhs::ConstColXpr bk(b,k);
+      derived()._solve_vector_with_guess_impl(bk,xk);
+
+      // The call to _solve_vector_with_guess updates m_info, so if it failed for a previous column
+      // we need to restore it to the worst value.
+      if(m_info==NumericalIssue)
+        global_info = NumericalIssue;
+      else if(m_info==NoConvergence)
+        global_info = NoConvergence;
+    }
+    m_info = global_info;
+  }
+
+  template<typename Rhs, typename DestDerived>
+  typename internal::enable_if<Rhs::ColsAtCompileTime==1 || DestDerived::ColsAtCompileTime==1>::type
+  _solve_with_guess_impl(const Rhs& b, MatrixBase<DestDerived> &dest) const
+  {
+    derived()._solve_vector_with_guess_impl(b,dest.derived());
+  }
+
+  /** \internal default initial guess = 0 */
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const Rhs& b, Dest& x) const
+  {
+    x.setZero();
+    derived()._solve_with_guess_impl(b,x);
+  }
+
 protected:
   void init()
   {
@@ -370,19 +420,19 @@ protected:
   {
     return m_matrixWrapper.matrix();
   }
-  
+
   template<typename InputType>
   void grab(const InputType &A)
   {
     m_matrixWrapper.grab(A);
   }
-  
+
   MatrixWrapper m_matrixWrapper;
   Preconditioner m_preconditioner;
 
   Index m_maxIterations;
   RealScalar m_tolerance;
-  
+
   mutable RealScalar m_error;
   mutable Index m_iterations;
   mutable ComputationInfo m_info;
diff --git a/contrib/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/contrib/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
index 0aea0e099d24f3f26ccf7bf3128f86ee0faa7992..203fd0ec63f5979870fb66546e6bd991335b8801 100644
--- a/contrib/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
+++ b/contrib/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
@@ -182,32 +182,14 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
   {
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
 
-    for(Index j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-
-      typename Dest::ColXpr xj(x,j);
-      internal::least_square_conjugate_gradient(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
-    }
-
-    m_isInitialized = true;
+    internal::least_square_conjugate_gradient(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error);
     m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
   }
-  
-  /** \internal */
-  using Base::_solve_impl;
-  template<typename Rhs,typename Dest>
-  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
-  {
-    x.setZero();
-    _solve_with_guess_impl(b.derived(),x);
-  }
 
 };
 
diff --git a/contrib/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/contrib/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 0ace451771759b424acebae3180aeb8e176b863c..7b896575428056e83c36fcfd2cf11672696d685e 100644
--- a/contrib/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/contrib/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -13,7 +13,7 @@
 namespace Eigen {
 
 template<typename Decomposition, typename RhsType, typename GuessType> class SolveWithGuess;
-  
+
 /** \class SolveWithGuess
   * \ingroup IterativeLinearSolvers_Module
   *
@@ -45,13 +45,15 @@ public:
   typedef typename internal::traits<SolveWithGuess>::PlainObject PlainObject;
   typedef typename internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type Base;
   typedef typename internal::ref_selector<SolveWithGuess>::type Nested;
-  
+
   SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
     : m_dec(dec), m_rhs(rhs), m_guess(guess)
   {}
-  
-  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); }
 
   EIGEN_DEVICE_FUNC const Decomposition& dec()   const { return m_dec; }
   EIGEN_DEVICE_FUNC const RhsType&       rhs()   const { return m_rhs; }
@@ -61,7 +63,7 @@ protected:
   const Decomposition &m_dec;
   const RhsType       &m_rhs;
   const GuessType     &m_guess;
-  
+
 private:
   Scalar coeff(Index row, Index col) const;
   Scalar coeff(Index i) const;
@@ -85,8 +87,8 @@ struct evaluator<SolveWithGuess<Decomposition,RhsType, GuessType> >
     m_result = solve.guess();
     solve.dec()._solve_with_guess_impl(solve.rhs(), m_result);
   }
-  
-protected:  
+
+protected:
   PlainObject m_result;
 };
 
@@ -108,7 +110,7 @@ struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, interna
   }
 };
 
-} // end namepsace internal
+} // end namespace internal
 
 } // end namespace Eigen
 
diff --git a/contrib/eigen/Eigen/src/Jacobi/Jacobi.h b/contrib/eigen/Eigen/src/Jacobi/Jacobi.h
index 1998c6322743bcc2aabd9bb86b488f7f27e2f284..76668a574aee62f2a52e434fd4b7ebc7d5a952ee 100644
--- a/contrib/eigen/Eigen/src/Jacobi/Jacobi.h
+++ b/contrib/eigen/Eigen/src/Jacobi/Jacobi.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_JACOBI_H
 #define EIGEN_JACOBI_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Jacobi_Module
   * \jacobi_module
@@ -37,17 +37,20 @@ template<typename Scalar> class JacobiRotation
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
     /** Default constructor without any initialization. */
+    EIGEN_DEVICE_FUNC
     JacobiRotation() {}
 
     /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
+    EIGEN_DEVICE_FUNC
     JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
 
-    Scalar& c() { return m_c; }
-    Scalar c() const { return m_c; }
-    Scalar& s() { return m_s; }
-    Scalar s() const { return m_s; }
+    EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
+    EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
+    EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
+    EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
 
     /** Concatenates two planar rotation */
+    EIGEN_DEVICE_FUNC
     JacobiRotation operator*(const JacobiRotation& other)
     {
       using numext::conj;
@@ -56,19 +59,26 @@ template<typename Scalar> class JacobiRotation
     }
 
     /** Returns the transposed transformation */
+    EIGEN_DEVICE_FUNC
     JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
 
     /** Returns the adjoint transformation */
+    EIGEN_DEVICE_FUNC
     JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
 
     template<typename Derived>
+    EIGEN_DEVICE_FUNC
     bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
+    EIGEN_DEVICE_FUNC
     bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
 
+    EIGEN_DEVICE_FUNC
     void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0);
 
   protected:
+    EIGEN_DEVICE_FUNC
     void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type);
+    EIGEN_DEVICE_FUNC
     void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type);
 
     Scalar m_c, m_s;
@@ -80,10 +90,12 @@ template<typename Scalar> class JacobiRotation
   * \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z)
 {
   using std::sqrt;
   using std::abs;
+
   RealScalar deno = RealScalar(2)*abs(y);
   if(deno < (std::numeric_limits<RealScalar>::min)())
   {
@@ -123,6 +135,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
   */
 template<typename Scalar>
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q)
 {
   return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
@@ -145,6 +158,7 @@ inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Ind
   * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r)
 {
   makeGivens(p, q, r, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
@@ -153,12 +167,13 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
 
 // specialization for complexes
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type)
 {
   using std::sqrt;
   using std::abs;
   using numext::conj;
-  
+
   if(q==Scalar(0))
   {
     m_c = numext::real(p)<0 ? Scalar(-1) : Scalar(1);
@@ -212,6 +227,7 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
 
 // specialization for reals
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type)
 {
   using std::sqrt;
@@ -257,12 +273,13 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
 
 namespace internal {
 /** \jacobi_module
-  * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y:
+  * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y:
   * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$
   *
   * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
 template<typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC
 void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
 }
 
@@ -274,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>&
   */
 template<typename Derived>
 template<typename OtherScalar>
+EIGEN_DEVICE_FUNC
 inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j)
 {
   RowXpr x(this->row(p));
@@ -289,6 +307,7 @@ inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRo
   */
 template<typename Derived>
 template<typename OtherScalar>
+EIGEN_DEVICE_FUNC
 inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j)
 {
   ColXpr x(this->col(p));
@@ -302,7 +321,8 @@ template<typename Scalar, typename OtherScalar,
          int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
 struct apply_rotation_in_the_plane_selector
 {
-  static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
+  static EIGEN_DEVICE_FUNC
+  inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
   {
     for(Index i=0; i<size; ++i)
     {
@@ -429,10 +449,11 @@ struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime
 };
 
 template<typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC
 void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
 {
   typedef typename VectorX::Scalar Scalar;
-  const bool Vectorizable =    (VectorX::Flags & VectorY::Flags & PacketAccessBit)
+  const bool Vectorizable =    (int(VectorX::Flags) & int(VectorY::Flags) & PacketAccessBit)
                             && (int(packet_traits<Scalar>::size) == int(packet_traits<OtherScalar>::size));
 
   eigen_assert(xpr_x.size() == xpr_y.size());
@@ -442,7 +463,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
 
   Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
   Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
-  
+
   OtherScalar c = j.c();
   OtherScalar s = j.s();
   if (c==OtherScalar(1) && s==OtherScalar(0))
diff --git a/contrib/eigen/Eigen/src/KLUSupport/KLUSupport.h b/contrib/eigen/Eigen/src/KLUSupport/KLUSupport.h
new file mode 100644
index 0000000000000000000000000000000000000000..215db35b03f3019df6f15df235f58e2486042c06
--- /dev/null
+++ b/contrib/eigen/Eigen/src/KLUSupport/KLUSupport.h
@@ -0,0 +1,358 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Kyle Macfarlan <kyle.macfarlan@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_H
+#define EIGEN_KLUSUPPORT_H
+
+namespace Eigen {
+
+/* TODO extract L, extract U, compute det, etc... */
+
+/** \ingroup KLUSupport_Module
+  * \brief A sparse LU factorization and solver based on KLU
+  *
+  * This class allows to solve for A.X = B sparse linear problems via a LU factorization
+  * using the KLU library. The sparse matrix A must be squared and full rank.
+  * The vectors or matrices X and B can be either dense or sparse.
+  *
+  * \warning The input matrix A should be in a \b compressed and \b column-major form.
+  * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+  *
+  * \implsparsesolverconcept
+  *
+  * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU
+  */
+
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) {
+   return klu_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
+}
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double>B[], klu_common *Common, std::complex<double>) {
+   return klu_z_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) {
+   return klu_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double>B[], klu_common *Common, std::complex<double>) {
+   return klu_z_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), 0, Common);
+}
+
+inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) {
+   return klu_factor(Ap, Ai, Ax, Symbolic, Common);
+}
+
+inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex<double> Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex<double>) {
+   return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common);
+}
+
+
+template<typename _MatrixType>
+class KLU : public SparseSolverBase<KLU<_MatrixType> >
+{
+  protected:
+    typedef SparseSolverBase<KLU<_MatrixType> > Base;
+    using Base::m_isInitialized;
+  public:
+    using Base::_solve_impl;
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef SparseMatrix<Scalar> LUMatrixType;
+    typedef SparseMatrix<Scalar,ColMajor,int> KLUMatrixType;
+    typedef Ref<const KLUMatrixType, StandardCompressedFormat> KLUMatrixRef;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+
+  public:
+
+    KLU()
+      : m_dummy(0,0), mp_matrix(m_dummy)
+    {
+      init();
+    }
+
+    template<typename InputMatrixType>
+    explicit KLU(const InputMatrixType& matrix)
+      : mp_matrix(matrix)
+    {
+      init();
+      compute(matrix);
+    }
+
+    ~KLU()
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric,&m_common);
+    }
+
+    EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); }
+    EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); }
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix.appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+#if 0 // not implemented yet
+    inline const LUMatrixType& matrixL() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_l;
+    }
+
+    inline const LUMatrixType& matrixU() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_u;
+    }
+
+    inline const IntColVectorType& permutationP() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_p;
+    }
+
+    inline const IntRowVectorType& permutationQ() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_q;
+    }
+#endif
+    /** Computes the sparse Cholesky decomposition of \a matrix
+     *  Note that the matrix should be column-major, and in compressed format for best performance.
+     *  \sa SparseMatrix::makeCompressed().
+     */
+    template<typename InputMatrixType>
+    void compute(const InputMatrixType& matrix)
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric, &m_common);
+      grab(matrix.derived());
+      analyzePattern_impl();
+      factorize_impl();
+    }
+
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
+      *
+      * \sa factorize(), compute()
+      */
+    template<typename InputMatrixType>
+    void analyzePattern(const InputMatrixType& matrix)
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric, &m_common);
+
+      grab(matrix.derived());
+
+      analyzePattern_impl();
+    }
+
+
+    /** Provides access to the control settings array used by KLU.
+      *
+      * See KLU documentation for details.
+      */
+    inline const klu_common& kluCommon() const
+    {
+      return m_common;
+    }
+
+    /** Provides access to the control settings array used by UmfPack.
+      *
+      * If this array contains NaN's, the default values are used.
+      *
+      * See KLU documentation for details.
+      */
+    inline klu_common& kluCommon()
+    {
+      return m_common;
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
+      *
+      * \sa analyzePattern(), compute()
+      */
+    template<typename InputMatrixType>
+    void factorize(const InputMatrixType& matrix)
+    {
+      eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()");
+      if(m_numeric)
+        klu_free_numeric(&m_numeric,&m_common);
+
+      grab(matrix.derived());
+
+      factorize_impl();
+    }
+
+    /** \internal */
+    template<typename BDerived,typename XDerived>
+    bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
+
+#if 0 // not implemented yet
+    Scalar determinant() const;
+
+    void extractData() const;
+#endif
+
+  protected:
+
+    void init()
+    {
+      m_info                  = InvalidInput;
+      m_isInitialized         = false;
+      m_numeric               = 0;
+      m_symbolic              = 0;
+      m_extractedDataAreDirty = true;
+
+      klu_defaults(&m_common);
+    }
+
+    void analyzePattern_impl()
+    {
+      m_info = InvalidInput;
+      m_analysisIsOk = false;
+      m_factorizationIsOk = false;
+      m_symbolic = klu_analyze(internal::convert_index<int>(mp_matrix.rows()),
+                                     const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()),
+                                     &m_common);
+      if (m_symbolic) {
+         m_isInitialized = true;
+         m_info = Success;
+         m_analysisIsOk = true;
+         m_extractedDataAreDirty = true;
+      }
+    }
+
+    void factorize_impl()
+    {
+
+      m_numeric = klu_factor(const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()), const_cast<Scalar*>(mp_matrix.valuePtr()),
+                                    m_symbolic, &m_common, Scalar());
+
+
+      m_info = m_numeric ? Success : NumericalIssue;
+      m_factorizationIsOk = m_numeric ? 1 : 0;
+      m_extractedDataAreDirty = true;
+    }
+
+    template<typename MatrixDerived>
+    void grab(const EigenBase<MatrixDerived> &A)
+    {
+      mp_matrix.~KLUMatrixRef();
+      ::new (&mp_matrix) KLUMatrixRef(A.derived());
+    }
+
+    void grab(const KLUMatrixRef &A)
+    {
+      if(&(A.derived()) != &mp_matrix)
+      {
+        mp_matrix.~KLUMatrixRef();
+        ::new (&mp_matrix) KLUMatrixRef(A);
+      }
+    }
+
+    // cached data to reduce reallocation, etc.
+#if 0 // not implemented yet
+    mutable LUMatrixType m_l;
+    mutable LUMatrixType m_u;
+    mutable IntColVectorType m_p;
+    mutable IntRowVectorType m_q;
+#endif
+
+    KLUMatrixType m_dummy;
+    KLUMatrixRef mp_matrix;
+
+    klu_numeric* m_numeric;
+    klu_symbolic* m_symbolic;
+    klu_common m_common;
+    mutable ComputationInfo m_info;
+    int m_factorizationIsOk;
+    int m_analysisIsOk;
+    mutable bool m_extractedDataAreDirty;
+
+  private:
+    KLU(const KLU& ) { }
+};
+
+#if 0 // not implemented yet
+template<typename MatrixType>
+void KLU<MatrixType>::extractData() const
+{
+  if (m_extractedDataAreDirty)
+  {
+     eigen_assert(false && "KLU: extractData Not Yet Implemented");
+
+    // get size of the data
+    int lnz, unz, rows, cols, nz_udiag;
+    umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
+
+    // allocate data
+    m_l.resize(rows,(std::min)(rows,cols));
+    m_l.resizeNonZeros(lnz);
+
+    m_u.resize((std::min)(rows,cols),cols);
+    m_u.resizeNonZeros(unz);
+
+    m_p.resize(rows);
+    m_q.resize(cols);
+
+    // extract
+    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
+                        m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
+                        m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
+
+    m_extractedDataAreDirty = false;
+  }
+}
+
+template<typename MatrixType>
+typename KLU<MatrixType>::Scalar KLU<MatrixType>::determinant() const
+{
+  eigen_assert(false && "KLU: extractData Not Yet Implemented");
+  return Scalar();
+}
+#endif
+
+template<typename MatrixType>
+template<typename BDerived,typename XDerived>
+bool KLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
+{
+  Index rhsCols = b.cols();
+  EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
+
+  x = b;
+  int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast<klu_common*>(&m_common), Scalar());
+
+  m_info = info!=0 ? Success : NumericalIssue;
+  return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_KLUSUPPORT_H
diff --git a/contrib/eigen/Eigen/src/LU/Determinant.h b/contrib/eigen/Eigen/src/LU/Determinant.h
index d6a3c1e5a54e056f062f439313d6b6e4a5cafa81..3a41e6fcba3b4e47c429c774579d02202c68db88 100644
--- a/contrib/eigen/Eigen/src/LU/Determinant.h
+++ b/contrib/eigen/Eigen/src/LU/Determinant.h
@@ -15,6 +15,7 @@ namespace Eigen {
 namespace internal {
 
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 inline const typename Derived::Scalar bruteforce_det3_helper
 (const MatrixBase<Derived>& matrix, int a, int b, int c)
 {
@@ -22,14 +23,6 @@ inline const typename Derived::Scalar bruteforce_det3_helper
          * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b));
 }
 
-template<typename Derived>
-const typename Derived::Scalar bruteforce_det4_helper
-(const MatrixBase<Derived>& matrix, int j, int k, int m, int n)
-{
-  return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1))
-       * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3));
-}
-
 template<typename Derived,
          int DeterminantType = Derived::RowsAtCompileTime
 > struct determinant_impl
@@ -44,7 +37,8 @@ template<typename Derived,
 
 template<typename Derived> struct determinant_impl<Derived, 1>
 {
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  static inline EIGEN_DEVICE_FUNC
+  typename traits<Derived>::Scalar run(const Derived& m)
   {
     return m.coeff(0,0);
   }
@@ -52,7 +46,8 @@ template<typename Derived> struct determinant_impl<Derived, 1>
 
 template<typename Derived> struct determinant_impl<Derived, 2>
 {
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  static inline EIGEN_DEVICE_FUNC
+  typename traits<Derived>::Scalar run(const Derived& m)
   {
     return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
   }
@@ -60,7 +55,8 @@ template<typename Derived> struct determinant_impl<Derived, 2>
 
 template<typename Derived> struct determinant_impl<Derived, 3>
 {
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
+  static inline EIGEN_DEVICE_FUNC
+  typename traits<Derived>::Scalar run(const Derived& m)
   {
     return bruteforce_det3_helper(m,0,1,2)
           - bruteforce_det3_helper(m,1,0,2)
@@ -70,15 +66,34 @@ template<typename Derived> struct determinant_impl<Derived, 3>
 
 template<typename Derived> struct determinant_impl<Derived, 4>
 {
-  static typename traits<Derived>::Scalar run(const Derived& m)
+  typedef typename traits<Derived>::Scalar Scalar;
+  static EIGEN_DEVICE_FUNC
+  Scalar run(const Derived& m)
+  {
+    Scalar d2_01 = det2(m, 0, 1);
+    Scalar d2_02 = det2(m, 0, 2);
+    Scalar d2_03 = det2(m, 0, 3);
+    Scalar d2_12 = det2(m, 1, 2);
+    Scalar d2_13 = det2(m, 1, 3);
+    Scalar d2_23 = det2(m, 2, 3);
+    Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12);
+    Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02);
+    Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01);
+    Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01);
+    return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) +
+           internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3);
+  }
+protected:
+  static EIGEN_DEVICE_FUNC
+  Scalar det2(const Derived& m, Index i0, Index i1)
+  {
+    return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1);
+  }
+
+  static EIGEN_DEVICE_FUNC
+  Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2)
   {
-    // trick by Martin Costabel to compute 4x4 det with only 30 muls
-    return bruteforce_det4_helper(m,0,1,2,3)
-          - bruteforce_det4_helper(m,0,2,1,3)
-          + bruteforce_det4_helper(m,0,3,1,2)
-          + bruteforce_det4_helper(m,1,2,0,3)
-          - bruteforce_det4_helper(m,1,3,0,2)
-          + bruteforce_det4_helper(m,2,3,0,1);
+    return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2));
   }
 };
 
@@ -89,6 +104,7 @@ template<typename Derived> struct determinant_impl<Derived, 4>
   * \returns the determinant of this matrix
   */
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
 {
   eigen_assert(rows() == cols());
diff --git a/contrib/eigen/Eigen/src/LU/FullPivLU.h b/contrib/eigen/Eigen/src/LU/FullPivLU.h
index 03b6af706136b501691f424ba6c2a830d5b2f04d..ba1749fa69c6a93afd16c2c7325c1dbc2fe0f4aa 100644
--- a/contrib/eigen/Eigen/src/LU/FullPivLU.h
+++ b/contrib/eigen/Eigen/src/LU/FullPivLU.h
@@ -18,6 +18,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
 {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -48,12 +49,12 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(),
   * permutationP(), permutationQ().
   *
-  * As an exemple, here is how the original matrix can be retrieved:
+  * As an example, here is how the original matrix can be retrieved:
   * \include class_FullPivLU.cpp
   * Output: \verbinclude class_FullPivLU.out
   *
   * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
   * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
   */
 template<typename _MatrixType> class FullPivLU
@@ -62,9 +63,9 @@ template<typename _MatrixType> class FullPivLU
   public:
     typedef _MatrixType MatrixType;
     typedef SolverBase<FullPivLU> Base;
+    friend class SolverBase<FullPivLU>;
 
     EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
-    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -218,6 +219,7 @@ template<typename _MatrixType> class FullPivLU
       return internal::image_retval<FullPivLU>(*this, originalMatrix);
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \return a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the LU decomposition.
       *
@@ -237,14 +239,10 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa TriangularView::solve(), kernel(), inverse()
       */
-    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
     inline const Solve<FullPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return Solve<FullPivLU, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
         the LU decomposition.
@@ -320,7 +318,7 @@ template<typename _MatrixType> class FullPivLU
       return m_usePrescribedThreshold ? m_prescribedThreshold
       // this formula comes from experimenting (see "LU precision tuning" thread on the list)
       // and turns out to be identical to Higham's formula used already in LDLt.
-                                      : NumTraits<Scalar>::epsilon() * m_lu.diagonalSize();
+          : NumTraits<Scalar>::epsilon() * RealScalar(m_lu.diagonalSize());
     }
 
     /** \returns the rank of the matrix of which *this is the LU decomposition.
@@ -406,16 +404,16 @@ template<typename _MatrixType> class FullPivLU
 
     MatrixType reconstructedMatrix() const;
 
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+    inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
 
     template<bool Conjugate, typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
@@ -531,8 +529,8 @@ void FullPivLU<MatrixType>::computeInPlace()
       m_nonzero_pivots = k;
       for(Index i = k; i < size; ++i)
       {
-        m_rowsTranspositions.coeffRef(i) = i;
-        m_colsTranspositions.coeffRef(i) = i;
+        m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
       }
       break;
     }
@@ -543,8 +541,8 @@ void FullPivLU<MatrixType>::computeInPlace()
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
 
-    m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner;
-    m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner;
+    m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+    m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
     if(k != row_of_biggest_in_corner) {
       m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner));
       ++number_of_transpositions;
@@ -757,7 +755,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
   const Index rows = this->rows(),
               cols = this->cols(),
               nonzero_pivots = this->rank();
-  eigen_assert(rhs.rows() == rows);
   const Index smalldim = (std::min)(rows, cols);
 
   if(nonzero_pivots == 0)
@@ -807,7 +804,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
 
   const Index rows = this->rows(), cols = this->cols(),
     nonzero_pivots = this->rank();
-   eigen_assert(rhs.rows() == cols);
   const Index smalldim = (std::min)(rows, cols);
 
   if(nonzero_pivots == 0)
@@ -821,29 +817,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
   // Step 1
   c = permutationQ().inverse() * rhs;
 
-  if (Conjugate) {
-    // Step 2
-    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
-        .template triangularView<Upper>()
-        .adjoint()
-        .solveInPlace(c.topRows(nonzero_pivots));
-    // Step 3
-    m_lu.topLeftCorner(smalldim, smalldim)
-        .template triangularView<UnitLower>()
-        .adjoint()
-        .solveInPlace(c.topRows(smalldim));
-  } else {
-    // Step 2
-    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
-        .template triangularView<Upper>()
-        .transpose()
-        .solveInPlace(c.topRows(nonzero_pivots));
-    // Step 3
-    m_lu.topLeftCorner(smalldim, smalldim)
-        .template triangularView<UnitLower>()
-        .transpose()
-        .solveInPlace(c.topRows(smalldim));
-  }
+  // Step 2
+  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(nonzero_pivots));
+
+  // Step 3
+  m_lu.topLeftCorner(smalldim, smalldim)
+      .template triangularView<UnitLower>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(smalldim));
 
   // Step 4
   PermutationPType invp = permutationP().inverse().eval();
diff --git a/contrib/eigen/Eigen/src/LU/InverseImpl.h b/contrib/eigen/Eigen/src/LU/InverseImpl.h
index f49f23360042b65a73f242b7d6cbce1d8ccceb21..a40cefa9e7dcd4cbcc5e42dcb8d8e5fe79be6e97 100644
--- a/contrib/eigen/Eigen/src/LU/InverseImpl.h
+++ b/contrib/eigen/Eigen/src/LU/InverseImpl.h
@@ -77,10 +77,11 @@ inline void compute_inverse_size2_helper(
     const MatrixType& matrix, const typename ResultType::Scalar& invdet,
     ResultType& result)
 {
+  typename ResultType::Scalar temp = matrix.coeff(0,0);
   result.coeffRef(0,0) =  matrix.coeff(1,1) * invdet;
   result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet;
   result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet;
-  result.coeffRef(1,1) =  matrix.coeff(0,0) * invdet;
+  result.coeffRef(1,1) =  temp * invdet;
 }
 
 template<typename MatrixType, typename ResultType>
@@ -143,13 +144,18 @@ inline void compute_inverse_size3_helper(
     const Matrix<typename ResultType::Scalar,3,1>& cofactors_col0,
     ResultType& result)
 {
-  result.row(0) = cofactors_col0 * invdet;
-  result.coeffRef(1,0) =  cofactor_3x3<MatrixType,0,1>(matrix) * invdet;
-  result.coeffRef(1,1) =  cofactor_3x3<MatrixType,1,1>(matrix) * invdet;
+  // Compute cofactors in a way that avoids aliasing issues.
+  typedef typename ResultType::Scalar Scalar;
+  const Scalar c01 = cofactor_3x3<MatrixType,0,1>(matrix) * invdet;
+  const Scalar c11 = cofactor_3x3<MatrixType,1,1>(matrix) * invdet;
+  const Scalar c02 = cofactor_3x3<MatrixType,0,2>(matrix) * invdet;
   result.coeffRef(1,2) =  cofactor_3x3<MatrixType,2,1>(matrix) * invdet;
-  result.coeffRef(2,0) =  cofactor_3x3<MatrixType,0,2>(matrix) * invdet;
   result.coeffRef(2,1) =  cofactor_3x3<MatrixType,1,2>(matrix) * invdet;
   result.coeffRef(2,2) =  cofactor_3x3<MatrixType,2,2>(matrix) * invdet;
+  result.coeffRef(1,0) =  c01;
+  result.coeffRef(1,1) =  c11;
+  result.coeffRef(2,0) =  c02;  
+  result.row(0) = cofactors_col0 * invdet;
 }
 
 template<typename MatrixType, typename ResultType>
@@ -181,14 +187,13 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
     bool& invertible
   )
   {
-    using std::abs;
     typedef typename ResultType::Scalar Scalar;
     Matrix<Scalar,3,1> cofactors_col0;
     cofactors_col0.coeffRef(0) =  cofactor_3x3<MatrixType,0,0>(matrix);
     cofactors_col0.coeffRef(1) =  cofactor_3x3<MatrixType,1,0>(matrix);
     cofactors_col0.coeffRef(2) =  cofactor_3x3<MatrixType,2,0>(matrix);
     determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
-    invertible = abs(determinant) > absDeterminantThreshold;
+    invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold;
     if(!invertible) return;
     const Scalar invdet = Scalar(1) / determinant;
     compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse);
@@ -273,7 +278,13 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
     using std::abs;
     determinant = matrix.determinant();
     invertible = abs(determinant) > absDeterminantThreshold;
-    if(invertible) compute_inverse<MatrixType, ResultType>::run(matrix, inverse);
+    if(invertible && extract_data(matrix) != extract_data(inverse)) {
+      compute_inverse<MatrixType, ResultType>::run(matrix, inverse);
+    }
+    else if(invertible) {
+      MatrixType matrix_t = matrix;
+      compute_inverse<MatrixType, ResultType>::run(matrix_t, inverse);
+    }
   }
 };
 
@@ -290,6 +301,7 @@ template<typename DstXprType, typename XprType>
 struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
 {
   typedef Inverse<XprType> SrcXprType;
+  EIGEN_DEVICE_FUNC
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
   {
     Index dstRows = src.rows();
@@ -332,6 +344,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
   * \sa computeInverseAndDetWithCheck()
   */
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
 {
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
@@ -345,6 +358,8 @@ inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
   *
   * This is only for fixed-size square matrices of size up to 4x4.
   *
+  * Notice that it will trigger a copy of input matrix when trying to do the inverse in place.
+  *
   * \param inverse Reference to the matrix in which to store the inverse.
   * \param determinant Reference to the variable in which to store the determinant.
   * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
@@ -385,6 +400,8 @@ inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(
   *
   * This is only for fixed-size square matrices of size up to 4x4.
   *
+  * Notice that it will trigger a copy of input matrix when trying to do the inverse in place.
+  *
   * \param inverse Reference to the matrix in which to store the inverse.
   * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
   * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
diff --git a/contrib/eigen/Eigen/src/LU/PartialPivLU.h b/contrib/eigen/Eigen/src/LU/PartialPivLU.h
index 6b10f39fab470e41bfab4e5619257935a74d74d5..34aed72494d4315d7ba22950445c26fc27b2fbae 100644
--- a/contrib/eigen/Eigen/src/LU/PartialPivLU.h
+++ b/contrib/eigen/Eigen/src/LU/PartialPivLU.h
@@ -19,6 +19,7 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
 {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   typedef traits<_MatrixType> BaseTraits;
   enum {
     Flags = BaseTraits::Flags & RowMajorBit,
@@ -69,7 +70,7 @@ struct enable_if_ref<Ref<T>,Derived> {
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
   *
   * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  * 
+  *
   * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
   */
 template<typename _MatrixType> class PartialPivLU
@@ -79,8 +80,9 @@ template<typename _MatrixType> class PartialPivLU
 
     typedef _MatrixType MatrixType;
     typedef SolverBase<PartialPivLU> Base;
+    friend class SolverBase<PartialPivLU>;
+
     EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
-    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -152,6 +154,7 @@ template<typename _MatrixType> class PartialPivLU
       return m_p;
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method returns the solution x to the equation Ax=b, where A is the matrix of which
       * *this is the LU decomposition.
       *
@@ -169,14 +172,10 @@ template<typename _MatrixType> class PartialPivLU
       *
       * \sa TriangularView::solve(), inverse(), computeInverse()
       */
-    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
     inline const Solve<PartialPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return Solve<PartialPivLU, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
         the LU decomposition.
@@ -217,8 +216,8 @@ template<typename _MatrixType> class PartialPivLU
 
     MatrixType reconstructedMatrix() const;
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+    EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+    EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
@@ -231,8 +230,6 @@ template<typename _MatrixType> class PartialPivLU
       * Step 3: replace c by the solution x to Ux = c.
       */
 
-      eigen_assert(rhs.rows() == m_lu.rows());
-
       // Step 1
       dst = permutationP() * rhs;
 
@@ -246,26 +243,21 @@ template<typename _MatrixType> class PartialPivLU
     template<bool Conjugate, typename RhsType, typename DstType>
     EIGEN_DEVICE_FUNC
     void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const {
-     /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+     /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P.
       * So we proceed as follows:
-      * Step 1: compute c = Pb.
-      * Step 2: replace c by the solution x to Lx = c.
-      * Step 3: replace c by the solution x to Ux = c.
+      * Step 1: compute c as the solution to L^T c = b
+      * Step 2: replace c by the solution x to U^T x = c.
+      * Step 3: update  c = P^-1 c.
       */
 
       eigen_assert(rhs.rows() == m_lu.cols());
 
-      if (Conjugate) {
-        // Step 1
-        dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs);
-        // Step 2
-        m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst);
-      } else {
-        // Step 1
-        dst = m_lu.template triangularView<Upper>().transpose().solve(rhs);
-        // Step 2
-        m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst);
-      }
+      // Step 1
+      dst = m_lu.template triangularView<Upper>().transpose()
+                .template conjugateIf<Conjugate>().solve(rhs);
+      // Step 2
+      m_lu.template triangularView<UnitLower>().transpose()
+          .template conjugateIf<Conjugate>().solveInPlace(dst);
       // Step 3
       dst = permutationP().transpose() * dst;
     }
@@ -339,17 +331,18 @@ PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix)
 namespace internal {
 
 /** \internal This is the blocked version of fullpivlu_unblocked() */
-template<typename Scalar, int StorageOrder, typename PivIndex>
+template<typename Scalar, int StorageOrder, typename PivIndex, int SizeAtCompileTime=Dynamic>
 struct partial_lu_impl
 {
-  // FIXME add a stride to Map, so that the following mapping becomes easier,
-  // another option would be to create an expression being able to automatically
-  // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly
-  // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix,
-  // and Block.
-  typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU;
-  typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
-  typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+  static const int UnBlockedBound = 16;
+  static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound;
+  static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic;
+  // Remaining rows and columns at compile-time:
+  static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic;
+  static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic;
+  typedef Matrix<Scalar, ActualSizeAtCompileTime, ActualSizeAtCompileTime, StorageOrder> MatrixType;
+  typedef Ref<MatrixType> MatrixTypeRef;
+  typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > BlockType;
   typedef typename MatrixType::RealScalar RealScalar;
 
   /** \internal performs the LU decomposition in-place of the matrix \a lu
@@ -362,19 +355,22 @@ struct partial_lu_impl
     *
     * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
     */
-  static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
+  static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
   {
     typedef scalar_score_coeff_op<Scalar> Scoring;
     typedef typename Scoring::result_type Score;
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows,cols);
+    // For small compile-time matrices it is worth processing the last row separately:
+    //  speedup: +100% for 2x2, +10% for others.
+    const Index endk = UnBlockedAtCompileTime ? size-1 : size;
     nb_transpositions = 0;
     Index first_zero_pivot = -1;
-    for(Index k = 0; k < size; ++k)
+    for(Index k = 0; k < endk; ++k)
     {
-      Index rrows = rows-k-1;
-      Index rcols = cols-k-1;
+      int rrows = internal::convert_index<int>(rows-k-1);
+      int rcols = internal::convert_index<int>(cols-k-1);
 
       Index row_of_biggest_in_col;
       Score biggest_in_corner
@@ -391,9 +387,7 @@ struct partial_lu_impl
           ++nb_transpositions;
         }
 
-        // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k)
-        // overflow but not the actual quotient?
-        lu.col(k).tail(rrows) /= lu.coeff(k,k);
+        lu.col(k).tail(fix<RRows>(rrows)) /= lu.coeff(k,k);
       }
       else if(first_zero_pivot==-1)
       {
@@ -403,8 +397,18 @@ struct partial_lu_impl
       }
 
       if(k<rows-1)
-        lu.bottomRightCorner(rrows,rcols).noalias() -= lu.col(k).tail(rrows) * lu.row(k).tail(rcols);
+        lu.bottomRightCorner(fix<RRows>(rrows),fix<RCols>(rcols)).noalias() -= lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols));
+    }
+
+    // special handling of the last entry
+    if(UnBlockedAtCompileTime)
+    {
+      Index k = endk;
+      row_transpositions[k] = PivIndex(k);
+      if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1)
+        first_zero_pivot = k;
     }
+
     return first_zero_pivot;
   }
 
@@ -420,18 +424,17 @@ struct partial_lu_impl
     * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
     *
     * \note This very low level interface using pointers, etc. is to:
-    *   1 - reduce the number of instanciations to the strict minimum
-    *   2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > >
+    *   1 - reduce the number of instantiations to the strict minimum
+    *   2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > >
     */
   static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
   {
-    MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols);
-    MatrixType lu(lu1,0,0,rows,cols);
+    MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride));
 
     const Index size = (std::min)(rows,cols);
 
     // if the matrix is too small, no blocking:
-    if(size<=16)
+    if(UnBlockedAtCompileTime || size<=UnBlockedBound)
     {
       return unblocked_lu(lu, row_transpositions, nb_transpositions);
     }
@@ -457,12 +460,12 @@ struct partial_lu_impl
       //                          A00 | A01 | A02
       // lu  = A_0 | A_1 | A_2 =  A10 | A11 | A12
       //                          A20 | A21 | A22
-      BlockType A_0(lu,0,0,rows,k);
-      BlockType A_2(lu,0,k+bs,rows,tsize);
-      BlockType A11(lu,k,k,bs,bs);
-      BlockType A12(lu,k,k+bs,bs,tsize);
-      BlockType A21(lu,k+bs,k,trows,bs);
-      BlockType A22(lu,k+bs,k+bs,trows,tsize);
+      BlockType A_0 = lu.block(0,0,rows,k);
+      BlockType A_2 = lu.block(0,k+bs,rows,tsize);
+      BlockType A11 = lu.block(k,k,bs,bs);
+      BlockType A12 = lu.block(k,k+bs,bs,tsize);
+      BlockType A21 = lu.block(k+bs,k,trows,bs);
+      BlockType A22 = lu.block(k+bs,k+bs,trows,tsize);
 
       PivIndex nb_transpositions_in_panel;
       // recursively call the blocked LU algorithm on [A11^T A21^T]^T
@@ -501,11 +504,18 @@ struct partial_lu_impl
 template<typename MatrixType, typename TranspositionType>
 void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions)
 {
+  // Special-case of zero matrix.
+  if (lu.rows() == 0 || lu.cols() == 0) {
+    nb_transpositions = 0;
+    return;
+  }
   eigen_assert(lu.cols() == row_transpositions.size());
-  eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
+  eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
 
   partial_lu_impl
-    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::StorageIndex>
+    < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor,
+      typename TranspositionType::StorageIndex,
+      EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)>
     ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
 }
 
diff --git a/contrib/eigen/Eigen/src/LU/arch/InverseSize4.h b/contrib/eigen/Eigen/src/LU/arch/InverseSize4.h
new file mode 100644
index 0000000000000000000000000000000000000000..a232ffc0aaded63fec6b52e87a527d6c17dd3ed3
--- /dev/null
+++ b/contrib/eigen/Eigen/src/LU/arch/InverseSize4.h
@@ -0,0 +1,351 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2001 Intel Corporation
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
+// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
+// adjugate of M and determinant of M respectively. M# is computed block-wise
+// using specific formulae. For proof, see:
+// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
+// Variable names are adopted from \src\LU\Inverse_SSE.h.
+//
+// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
+// comes from the following Intel's library:
+// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
+//
+// Here is the respective copyright and license statement:
+//
+//   Copyright (c) 2001 Intel Corporation.
+//
+// Permition is granted to use, copy, distribute and prepare derivative works
+// of this library for any purpose and without fee, provided, that the above
+// copyright notice and this statement appear in all copies.
+// Intel makes no representations about the suitability of this software for
+// any purpose, and specifically disclaims all warranties.
+// See LEGAL.TXT for all the legal information.
+//
+// TODO: Unify implementations of different data types (i.e. float and double).
+#ifndef EIGEN_INVERSE_SIZE_4_H
+#define EIGEN_INVERSE_SIZE_4_H
+
+namespace Eigen
+{
+namespace internal
+{
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
+{
+  enum
+  {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result)
+  {
+    ActualMatrixType matrix(mat);
+
+    const float* data = matrix.data();
+    const Index stride = matrix.innerStride();
+    Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data);
+    Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
+    Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
+    Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
+
+    // Four 2x2 sub-matrices of the input matrix
+    // input = [[A, B],
+    //          [C, D]]
+    Packet4f A, B, C, D;
+
+    if (!StorageOrdersMatch)
+    {
+      A = vec4f_unpacklo(_L1, _L2);
+      B = vec4f_unpacklo(_L3, _L4);
+      C = vec4f_unpackhi(_L1, _L2);
+      D = vec4f_unpackhi(_L3, _L4);
+    }
+    else
+    {
+      A = vec4f_movelh(_L1, _L2);
+      B = vec4f_movehl(_L2, _L1);
+      C = vec4f_movelh(_L3, _L4);
+      D = vec4f_movehl(_L4, _L3);
+    }
+
+    Packet4f AB, DC;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
+    AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
+
+    // DC = D#*C
+    DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
+    DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
+
+    // determinants of the sub-matrices
+    Packet4f dA, dB, dC, dD;
+
+    dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
+    dA = psub(dA, vec4f_movehl(dA, dA));
+
+    dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
+    dB = psub(dB, vec4f_movehl(dB, dB));
+
+    dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
+    dC = psub(dC, vec4f_movehl(dC, dC));
+
+    dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
+    dD = psub(dD, vec4f_movehl(dD, dD));
+
+    Packet4f d, d1, d2;
+
+    d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
+    d = padd(d, vec4f_movehl(d, d));
+    d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
+
+    // Four sub-matrices of the inverse
+    Packet4f iA, iB, iC, iD;
+
+    // iD = D*|A| - C*A#*B
+    iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
+    iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
+    iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
+
+    // iA = A*|D| - B*D#*C
+    iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
+    iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
+    iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
+    iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
+    iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
+    iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
+    iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
+
+    const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f};
+    const Packet4f p4f_sign_PNNP = ploadu<Packet4f>(sign_mask);
+    rd = pxor(rd, p4f_sign_PNNP);
+    iA = pmul(iA, rd);
+    iB = pmul(iB, rd);
+    iC = pmul(iC, rd);
+    iD = pmul(iD, rd);
+
+    Index res_stride = result.outerStride();
+    float *res = result.data();
+
+    pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
+    pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
+  }
+};
+
+#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
+// same algorithm as above, except that each operand is split into
+// halves for two registers to hold.
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
+{
+  enum
+  {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
+                               MatrixType const &,
+                               typename MatrixType::PlainObject>::type
+      ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result)
+  {
+    ActualMatrixType matrix(mat);
+
+    // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
+    // row e.g. A1, upper row of A, A2, lower row of A
+    // input = [[A, B],  =  [[[A1, [B1,
+    //          [C, D]]        A2], B2]],
+    //                       [[C1, [D1,
+    //                         C2], D2]]]
+
+    Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
+
+    const double* data = matrix.data();
+    const Index stride = matrix.innerStride();
+    if (StorageOrdersMatch)
+    {
+      A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+      B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+      A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+      B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
+      C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+      D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+      C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+      D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
+    }
+    else
+    {
+      Packet2d temp;
+      A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+      C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+      A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+      C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
+      temp = A1;
+      A1 = vec2d_unpacklo(A1, A2);
+      A2 = vec2d_unpackhi(temp, A2);
+
+      temp = C1;
+      C1 = vec2d_unpacklo(C1, C2);
+      C2 = vec2d_unpackhi(temp, C2);
+
+      B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+      D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+      B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+      D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
+
+      temp = B1;
+      B1 = vec2d_unpacklo(B1, B2);
+      B2 = vec2d_unpackhi(temp, B2);
+
+      temp = D1;
+      D1 = vec2d_unpacklo(D1, D2);
+      D2 = vec2d_unpackhi(temp, D2);
+    }
+
+    // determinants of the sub-matrices
+    Packet2d dA, dB, dC, dD;
+
+    dA = vec2d_swizzle2(A2, A2, 1);
+    dA = pmul(A1, dA);
+    dA = psub(dA, vec2d_duplane(dA, 1));
+
+    dB = vec2d_swizzle2(B2, B2, 1);
+    dB = pmul(B1, dB);
+    dB = psub(dB, vec2d_duplane(dB, 1));
+
+    dC = vec2d_swizzle2(C2, C2, 1);
+    dC = pmul(C1, dC);
+    dC = psub(dC, vec2d_duplane(dC, 1));
+
+    dD = vec2d_swizzle2(D2, D2, 1);
+    dD = pmul(D1, dD);
+    dD = psub(dD, vec2d_duplane(dD, 1));
+
+    Packet2d DC1, DC2, AB1, AB2;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB1 = pmul(B1, vec2d_duplane(A2, 1));
+    AB2 = pmul(B2, vec2d_duplane(A1, 0));
+    AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
+    AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
+
+    // DC = D#*C
+    DC1 = pmul(C1, vec2d_duplane(D2, 1));
+    DC2 = pmul(C2, vec2d_duplane(D1, 0));
+    DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
+    DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
+
+    Packet2d d1, d2;
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet2d det;
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet2d rd;
+
+    d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
+    d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
+    rd = padd(d1, d2);
+    rd = padd(rd, vec2d_duplane(rd, 1));
+
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    det = padd(d1, d2);
+    det = psub(det, rd);
+    det = vec2d_duplane(det, 0);
+    rd = pdiv(pset1<Packet2d>(1.0), det);
+
+    // rows of four sub-matrices of the inverse
+    Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
+
+    // iD = D*|A| - C*A#*B
+    iD1 = pmul(AB1, vec2d_duplane(C1, 0));
+    iD2 = pmul(AB1, vec2d_duplane(C2, 0));
+    iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
+    iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
+    dA = vec2d_duplane(dA, 0);
+    iD1 = psub(pmul(D1, dA), iD1);
+    iD2 = psub(pmul(D2, dA), iD2);
+
+    // iA = A*|D| - B*D#*C
+    iA1 = pmul(DC1, vec2d_duplane(B1, 0));
+    iA2 = pmul(DC1, vec2d_duplane(B2, 0));
+    iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
+    iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
+    dD = vec2d_duplane(dD, 0);
+    iA1 = psub(pmul(A1, dD), iA1);
+    iA2 = psub(pmul(A2, dD), iA2);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
+    iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
+    iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    dB = vec2d_duplane(dB, 0);
+    iB1 = psub(pmul(C1, dB), iB1);
+    iB2 = psub(pmul(C2, dB), iB2);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
+    iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
+    iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    dC = vec2d_duplane(dC, 0);
+    iC1 = psub(pmul(B1, dC), iC1);
+    iC2 = psub(pmul(B2, dC), iC2);
+
+    const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)};
+    const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0};
+    const Packet2d sign_PN = ploadu<Packet2d>(sign_mask1);
+    const Packet2d sign_NP = ploadu<Packet2d>(sign_mask2);
+    d1 = pxor(rd, sign_PN);
+    d2 = pxor(rd, sign_NP);
+
+    Index res_stride = result.outerStride();
+    double *res = result.data();
+    pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
+  }
+};
+#endif
+} // namespace internal
+} // namespace Eigen
+#endif
diff --git a/contrib/eigen/Eigen/src/LU/arch/Inverse_SSE.h b/contrib/eigen/Eigen/src/LU/arch/Inverse_SSE.h
deleted file mode 100644
index 4dce2ef20eedfa847cab9814494410bf7f99e094..0000000000000000000000000000000000000000
--- a/contrib/eigen/Eigen/src/LU/arch/Inverse_SSE.h
+++ /dev/null
@@ -1,338 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2001 Intel Corporation
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// The SSE code for the 4x4 float and double matrix inverse in this file
-// comes from the following Intel's library:
-// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
-//
-// Here is the respective copyright and license statement:
-//
-//   Copyright (c) 2001 Intel Corporation.
-//
-// Permition is granted to use, copy, distribute and prepare derivative works
-// of this library for any purpose and without fee, provided, that the above
-// copyright notice and this statement appear in all copies.
-// Intel makes no representations about the suitability of this software for
-// any purpose, and specifically disclaims all warranties.
-// See LEGAL.TXT for all the legal information.
-
-#ifndef EIGEN_INVERSE_SSE_H
-#define EIGEN_INVERSE_SSE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
-{
-  enum {
-    MatrixAlignment     = traits<MatrixType>::Alignment,
-    ResultAlignment     = traits<ResultType>::Alignment,
-    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
-  };
-  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
-  
-  static void run(const MatrixType& mat, ResultType& result)
-  {
-    ActualMatrixType matrix(mat);
-    const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000));
-
-    // Load the full matrix into registers
-    __m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
-    __m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
-    __m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
-    __m128 _L4 = matrix.template packet<MatrixAlignment>(12);
-
-    // The inverse is calculated using "Divide and Conquer" technique. The
-    // original matrix is divide into four 2x2 sub-matrices. Since each
-    // register holds four matrix element, the smaller matrices are
-    // represented as a registers. Hence we get a better locality of the
-    // calculations.
-
-    __m128 A, B, C, D; // the four sub-matrices
-    if(!StorageOrdersMatch)
-    {
-      A = _mm_unpacklo_ps(_L1, _L2);
-      B = _mm_unpacklo_ps(_L3, _L4);
-      C = _mm_unpackhi_ps(_L1, _L2);
-      D = _mm_unpackhi_ps(_L3, _L4);
-    }
-    else
-    {
-      A = _mm_movelh_ps(_L1, _L2);
-      B = _mm_movehl_ps(_L2, _L1);
-      C = _mm_movelh_ps(_L3, _L4);
-      D = _mm_movehl_ps(_L4, _L3);
-    }
-
-    __m128 iA, iB, iC, iD,                 // partial inverse of the sub-matrices
-            DC, AB;
-    __m128 dA, dB, dC, dD;                 // determinant of the sub-matrices
-    __m128 det, d, d1, d2;
-    __m128 rd;                             // reciprocal of the determinant
-
-    //  AB = A# * B
-    AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
-    AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
-    //  DC = D# * C
-    DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
-    DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));
-
-    //  dA = |A|
-    dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
-    dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
-    //  dB = |B|
-    dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
-    dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));
-
-    //  dC = |C|
-    dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
-    dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
-    //  dD = |D|
-    dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
-    dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));
-
-    //  d = trace(AB*DC) = trace(A#*B*D#*C)
-    d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);
-
-    //  iD = C*A#*B
-    iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
-    iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
-    //  iA = B*D#*C
-    iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
-    iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));
-
-    //  d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
-    d  = _mm_add_ps(d, _mm_movehl_ps(d, d));
-    d  = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
-    d1 = _mm_mul_ss(dA,dD);
-    d2 = _mm_mul_ss(dB,dC);
-
-    //  iD = D*|A| - C*A#*B
-    iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);
-
-    //  iA = A*|D| - B*D#*C;
-    iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);
-
-    //  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
-    det = _mm_sub_ss(_mm_add_ss(d1,d2),d);
-    rd  = _mm_div_ss(_mm_set_ss(1.0f), det);
-
-//     #ifdef ZERO_SINGULAR
-//         rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);
-//     #endif
-
-    //  iB = D * (A#B)# = D*B#*A
-    iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
-    iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
-    //  iC = A * (D#C)# = A*C#*D
-    iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
-    iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
-
-    rd = _mm_shuffle_ps(rd,rd,0);
-    rd = _mm_xor_ps(rd, p4f_sign_PNNP);
-
-    //  iB = C*|B| - D*B#*A
-    iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
-
-    //  iC = B*|C| - A*C#*D;
-    iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);
-
-    //  iX = iX / det
-    iA = _mm_mul_ps(rd,iA);
-    iB = _mm_mul_ps(rd,iB);
-    iC = _mm_mul_ps(rd,iC);
-    iD = _mm_mul_ps(rd,iD);
-
-    Index res_stride = result.outerStride();
-    float* res = result.data();
-    pstoret<float, Packet4f, ResultAlignment>(res+0,            _mm_shuffle_ps(iA,iB,0x77));
-    pstoret<float, Packet4f, ResultAlignment>(res+res_stride,   _mm_shuffle_ps(iA,iB,0x22));
-    pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77));
-    pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22));
-  }
-
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
-{
-  enum {
-    MatrixAlignment     = traits<MatrixType>::Alignment,
-    ResultAlignment     = traits<ResultType>::Alignment,
-    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
-  };
-  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
-  
-  static void run(const MatrixType& mat, ResultType& result)
-  {
-    ActualMatrixType matrix(mat);
-    const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
-    const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-
-    // The inverse is calculated using "Divide and Conquer" technique. The
-    // original matrix is divide into four 2x2 sub-matrices. Since each
-    // register of the matrix holds two elements, the smaller matrices are
-    // consisted of two registers. Hence we get a better locality of the
-    // calculations.
-
-    // the four sub-matrices
-    __m128d A1, A2, B1, B2, C1, C2, D1, D2;
-    
-    if(StorageOrdersMatch)
-    {
-      A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
-      A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
-      C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
-      C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
-    }
-    else
-    {
-      __m128d tmp;
-      A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
-      A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
-      tmp = A1;
-      A1 = _mm_unpacklo_pd(A1,A2);
-      A2 = _mm_unpackhi_pd(tmp,A2);
-      tmp = C1;
-      C1 = _mm_unpacklo_pd(C1,C2);
-      C2 = _mm_unpackhi_pd(tmp,C2);
-      
-      B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
-      B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
-      tmp = B1;
-      B1 = _mm_unpacklo_pd(B1,B2);
-      B2 = _mm_unpackhi_pd(tmp,B2);
-      tmp = D1;
-      D1 = _mm_unpacklo_pd(D1,D2);
-      D2 = _mm_unpackhi_pd(tmp,D2);
-    }
-    
-    __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2,     // partial invese of the sub-matrices
-            DC1, DC2, AB1, AB2;
-    __m128d dA, dB, dC, dD;     // determinant of the sub-matrices
-    __m128d det, d1, d2, rd;
-
-    //  dA = |A|
-    dA = _mm_shuffle_pd(A2, A2, 1);
-    dA = _mm_mul_pd(A1, dA);
-    dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3));
-    //  dB = |B|
-    dB = _mm_shuffle_pd(B2, B2, 1);
-    dB = _mm_mul_pd(B1, dB);
-    dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3));
-
-    //  AB = A# * B
-    AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3));
-    AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0));
-    AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3)));
-    AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0)));
-
-    //  dC = |C|
-    dC = _mm_shuffle_pd(C2, C2, 1);
-    dC = _mm_mul_pd(C1, dC);
-    dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3));
-    //  dD = |D|
-    dD = _mm_shuffle_pd(D2, D2, 1);
-    dD = _mm_mul_pd(D1, dD);
-    dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3));
-
-    //  DC = D# * C
-    DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3));
-    DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0));
-    DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3)));
-    DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0)));
-
-    //  rd = trace(AB*DC) = trace(A#*B*D#*C)
-    d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0));
-    d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3));
-    rd = _mm_add_pd(d1, d2);
-    rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3));
-
-    //  iD = C*A#*B
-    iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0));
-    iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0));
-    iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3)));
-    iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3)));
-
-    //  iA = B*D#*C
-    iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0));
-    iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0));
-    iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
-    iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));
-
-    //  iD = D*|A| - C*A#*B
-    dA = _mm_shuffle_pd(dA,dA,0);
-    iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
-    iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);
-
-    //  iA = A*|D| - B*D#*C;
-    dD = _mm_shuffle_pd(dD,dD,0);
-    iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
-    iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);
-
-    d1 = _mm_mul_sd(dA, dD);
-    d2 = _mm_mul_sd(dB, dC);
-
-    //  iB = D * (A#B)# = D*B#*A
-    iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
-    iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
-    iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
-    iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));
-
-    //  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
-    det = _mm_add_sd(d1, d2);
-    det = _mm_sub_sd(det, rd);
-
-    //  iC = A * (D#C)# = A*C#*D
-    iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
-    iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
-    iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
-    iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));
-
-    rd = _mm_div_sd(_mm_set_sd(1.0), det);
-//     #ifdef ZERO_SINGULAR
-//         rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd);
-//     #endif
-    rd = _mm_shuffle_pd(rd,rd,0);
-
-    //  iB = C*|B| - D*B#*A
-    dB = _mm_shuffle_pd(dB,dB,0);
-    iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
-    iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);
-
-    d1 = _mm_xor_pd(rd, _Sign_PN);
-    d2 = _mm_xor_pd(rd, _Sign_NP);
-
-    //  iC = B*|C| - A*C#*D;
-    dC = _mm_shuffle_pd(dC,dC,0);
-    iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
-    iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
-
-    Index res_stride = result.outerStride();
-    double* res = result.data();
-    pstoret<double, Packet2d, ResultAlignment>(res+0,             _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+res_stride,    _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2,             _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2,  _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_INVERSE_SSE_H
diff --git a/contrib/eigen/Eigen/src/OrderingMethods/Amd.h b/contrib/eigen/Eigen/src/OrderingMethods/Amd.h
index f91ecb24efc1f20b90d24ee98c40b46d7742e639..7ca3f33b12f61258819d7bd2bf8fe3222397aace 100644
--- a/contrib/eigen/Eigen/src/OrderingMethods/Amd.h
+++ b/contrib/eigen/Eigen/src/OrderingMethods/Amd.h
@@ -2,32 +2,22 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*
-
 NOTE: this routine has been adapted from the CSparse library:
 
 Copyright (c) 2006, Timothy A. Davis.
 http://www.suitesparse.com
 
-CSparse is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-CSparse is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this Module; if not, write to the Free Software
-Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
+The author of CSparse, Timothy A. Davis., has executed a license with Google LLC
+to permit distribution of this code and derivative works as part of Eigen under
+the Mozilla Public License v. 2.0, as stated at the top of this file.
 */
 
-#include "../Core/util/NonMPL2.h"
-
 #ifndef EIGEN_SPARSE_AMD_H
 #define EIGEN_SPARSE_AMD_H
 
diff --git a/contrib/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h b/contrib/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h
index da85b4d6ea23cb4dedfa0f502e288a89fd79b01b..8e339a704a1812c368e19f445aaec7deda219eb2 100644
--- a/contrib/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/contrib/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -13,115 +13,119 @@
 //   Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
 //   developed in collaboration with John Gilbert, Xerox PARC, and Esmond
 //   Ng, Oak Ridge National Laboratory.
-// 
+//
 //     Date:
-// 
+//
 //   September 8, 2003.  Version 2.3.
-// 
+//
 //     Acknowledgements:
-// 
+//
 //   This work was supported by the National Science Foundation, under
 //   grants DMS-9504974 and DMS-9803599.
-// 
+//
 //     Notice:
-// 
+//
 //   Copyright (c) 1998-2003 by the University of Florida.
 //   All Rights Reserved.
-// 
+//
 //   THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
 //   EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-// 
+//
 //   Permission is hereby granted to use, copy, modify, and/or distribute
 //   this program, provided that the Copyright, this License, and the
 //   Availability of the original version is retained on all copies and made
 //   accessible to the end-user of any code or package that includes COLAMD
-//   or any modified version of COLAMD. 
-// 
+//   or any modified version of COLAMD.
+//
 //     Availability:
-// 
+//
 //   The colamd/symamd library is available at
-// 
+//
 //       http://www.suitesparse.com
 
-  
+
 #ifndef EIGEN_COLAMD_H
 #define EIGEN_COLAMD_H
 
 namespace internal {
+
+namespace Colamd {
+
 /* Ensure that debugging is turned off: */
 #ifndef COLAMD_NDEBUG
 #define COLAMD_NDEBUG
 #endif /* NDEBUG */
+
+
 /* ========================================================================== */
 /* === Knob and statistics definitions ====================================== */
 /* ========================================================================== */
 
 /* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
-#define COLAMD_KNOBS 20
+const int NKnobs = 20;
 
 /* number of output statistics.  Only stats [0..6] are currently used. */
-#define COLAMD_STATS 20 
+const int NStats = 20;
 
-/* knobs [0] and stats [0]: dense row knob and output statistic. */
-#define COLAMD_DENSE_ROW 0
+/* Indices into knobs and stats array. */
+enum KnobsStatsIndex {
+  /* knobs [0] and stats [0]: dense row knob and output statistic. */
+  DenseRow = 0,
 
-/* knobs [1] and stats [1]: dense column knob and output statistic. */
-#define COLAMD_DENSE_COL 1
+  /* knobs [1] and stats [1]: dense column knob and output statistic. */
+  DenseCol = 1,
 
-/* stats [2]: memory defragmentation count output statistic */
-#define COLAMD_DEFRAG_COUNT 2
+  /* stats [2]: memory defragmentation count output statistic */
+  DefragCount = 2,
 
-/* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
-#define COLAMD_STATUS 3
+  /* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
+  Status = 3,
 
-/* stats [4..6]: error info, or info on jumbled columns */ 
-#define COLAMD_INFO1 4
-#define COLAMD_INFO2 5
-#define COLAMD_INFO3 6
+  /* stats [4..6]: error info, or info on jumbled columns */
+  Info1 = 4,
+  Info2 = 5,
+  Info3 = 6
+};
 
 /* error codes returned in stats [3]: */
-#define COLAMD_OK       (0)
-#define COLAMD_OK_BUT_JUMBLED     (1)
-#define COLAMD_ERROR_A_not_present    (-1)
-#define COLAMD_ERROR_p_not_present    (-2)
-#define COLAMD_ERROR_nrow_negative    (-3)
-#define COLAMD_ERROR_ncol_negative    (-4)
-#define COLAMD_ERROR_nnz_negative   (-5)
-#define COLAMD_ERROR_p0_nonzero     (-6)
-#define COLAMD_ERROR_A_too_small    (-7)
-#define COLAMD_ERROR_col_length_negative  (-8)
-#define COLAMD_ERROR_row_index_out_of_bounds  (-9)
-#define COLAMD_ERROR_out_of_memory    (-10)
-#define COLAMD_ERROR_internal_error   (-999)
-
+enum Status {
+  Ok = 0,
+  OkButJumbled = 1,
+  ErrorANotPresent = -1,
+  ErrorPNotPresent = -2,
+  ErrorNrowNegative = -3,
+  ErrorNcolNegative = -4,
+  ErrorNnzNegative = -5,
+  ErrorP0Nonzero = -6,
+  ErrorATooSmall = -7,
+  ErrorColLengthNegative = -8,
+  ErrorRowIndexOutOfBounds = -9,
+  ErrorOutOfMemory = -10,
+  ErrorInternalError = -999
+};
 /* ========================================================================== */
 /* === Definitions ========================================================== */
 /* ========================================================================== */
 
-#define ONES_COMPLEMENT(r) (-(r)-1)
+template <typename IndexType>
+IndexType ones_complement(const IndexType r) {
+  return (-(r)-1);
+}
 
 /* -------------------------------------------------------------------------- */
-
-#define COLAMD_EMPTY (-1)
+const int Empty = -1;
 
 /* Row and column status */
-#define ALIVE (0)
-#define DEAD  (-1)
+enum RowColumnStatus {
+  Alive = 0,
+  Dead = -1
+};
 
 /* Column status */
-#define DEAD_PRINCIPAL    (-1)
-#define DEAD_NON_PRINCIPAL  (-2)
-
-/* Macros for row and column status update and checking. */
-#define ROW_IS_DEAD(r)      ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
-#define ROW_IS_MARKED_DEAD(row_mark)  (row_mark < ALIVE)
-#define ROW_IS_ALIVE(r)     (Row [r].shared2.mark >= ALIVE)
-#define COL_IS_DEAD(c)      (Col [c].start < ALIVE)
-#define COL_IS_ALIVE(c)     (Col [c].start >= ALIVE)
-#define COL_IS_DEAD_PRINCIPAL(c)  (Col [c].start == DEAD_PRINCIPAL)
-#define KILL_ROW(r)     { Row [r].shared2.mark = DEAD ; }
-#define KILL_PRINCIPAL_COL(c)   { Col [c].start = DEAD_PRINCIPAL ; }
-#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; }
+enum ColumnStatus {
+  DeadPrincipal = -1,
+  DeadNonPrincipal = -2
+};
 
 /* ========================================================================== */
 /* === Colamd reporting mechanism =========================================== */
@@ -129,9 +133,9 @@ namespace internal {
 
 // == Row and Column structures ==
 template <typename IndexType>
-struct colamd_col
+struct ColStructure
 {
-  IndexType start ;   /* index for A of first row in this column, or DEAD */
+  IndexType start ;   /* index for A of first row in this column, or Dead */
   /* if column is dead */
   IndexType length ;  /* number of rows in this column */
   union
@@ -159,11 +163,21 @@ struct colamd_col
     IndexType degree_next ; /* next column, if col is in a degree list */
     IndexType hash_next ;   /* next column, if col is in a hash list */
   } shared4 ;
-  
+
+  inline bool is_dead() const { return start < Alive; }
+
+  inline bool is_alive() const { return start >= Alive; }
+
+  inline bool is_dead_principal() const { return start == DeadPrincipal; }
+
+  inline void kill_principal() { start = DeadPrincipal; }
+
+  inline void kill_non_principal() { start = DeadNonPrincipal; }
+
 };
- 
+
 template <typename IndexType>
-struct Colamd_Row
+struct RowStructure
 {
   IndexType start ;   /* index for A of first col in this row */
   IndexType length ;  /* number of principal columns in this row */
@@ -177,13 +191,19 @@ struct Colamd_Row
     IndexType mark ;  /* for computing set differences and marking dead rows*/
     IndexType first_column ;/* first column in row (used in garbage collection) */
   } shared2 ;
-  
+
+  inline bool is_dead() const { return shared2.mark < Alive; }
+
+  inline bool is_alive() const { return shared2.mark >= Alive; }
+
+  inline void kill() { shared2.mark = Dead; }
+
 };
- 
+
 /* ========================================================================== */
 /* === Colamd recommended memory size ======================================= */
 /* ========================================================================== */
- 
+
 /*
   The recommended length Alen of the array A passed to colamd is given by
   the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro.  It returns -1 if any
@@ -192,41 +212,41 @@ struct Colamd_Row
   required for the Col and Row arrays, respectively, which are internal to
   colamd.  An additional n_col space is the minimal amount of "elbow room",
   and nnz/5 more space is recommended for run time efficiency.
-  
+
   This macro is not needed when using symamd.
-  
+
   Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid
   gcc -pedantic warning messages.
 */
 template <typename IndexType>
-inline IndexType colamd_c(IndexType n_col) 
-{ return IndexType( ((n_col) + 1) * sizeof (colamd_col<IndexType>) / sizeof (IndexType) ) ; }
+inline IndexType colamd_c(IndexType n_col)
+{ return IndexType( ((n_col) + 1) * sizeof (ColStructure<IndexType>) / sizeof (IndexType) ) ; }
 
 template <typename IndexType>
 inline IndexType  colamd_r(IndexType n_row)
-{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row<IndexType>) / sizeof (IndexType)); }
+{ return IndexType(((n_row) + 1) * sizeof (RowStructure<IndexType>) / sizeof (IndexType)); }
 
 // Prototypes of non-user callable routines
 template <typename IndexType>
-static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); 
+static IndexType init_rows_cols (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> col [], IndexType A [], IndexType p [], IndexType stats[NStats] );
 
 template <typename IndexType>
-static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg);
+static void init_scoring (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType head [], double knobs[NKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg);
 
 template <typename IndexType>
-static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree);
+static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree);
 
 template <typename IndexType>
-static void order_children (IndexType n_col, colamd_col<IndexType> Col [], IndexType p []);
+static void order_children (IndexType n_col, ColStructure<IndexType> Col [], IndexType p []);
 
 template <typename IndexType>
-static void detect_super_cols (colamd_col<IndexType> Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ;
+static void detect_super_cols (ColStructure<IndexType> Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ;
 
 template <typename IndexType>
-static IndexType garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType *pfree) ;
+static IndexType garbage_collection (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType *pfree) ;
 
 template <typename IndexType>
-static inline  IndexType clear_mark (IndexType n_row, Colamd_Row<IndexType> Row [] ) ;
+static inline  IndexType clear_mark (IndexType n_row, RowStructure<IndexType> Row [] ) ;
 
 /* === No debugging ========================================================= */
 
@@ -240,37 +260,37 @@ static inline  IndexType clear_mark (IndexType n_row, Colamd_Row<IndexType> Row
 
 
 /**
- * \brief Returns the recommended value of Alen 
- * 
- * Returns recommended value of Alen for use by colamd.  
- * Returns -1 if any input argument is negative.  
- * The use of this routine or macro is optional.  
- * Note that the macro uses its arguments   more than once, 
- * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.  
- * 
+ * \brief Returns the recommended value of Alen
+ *
+ * Returns recommended value of Alen for use by colamd.
+ * Returns -1 if any input argument is negative.
+ * The use of this routine or macro is optional.
+ * Note that the macro uses its arguments   more than once,
+ * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.
+ *
  * \param nnz nonzeros in A
  * \param n_row number of rows in A
  * \param n_col number of columns in A
  * \return recommended value of Alen for use by colamd
  */
 template <typename IndexType>
-inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col)
+inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col)
 {
   if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0)
     return (-1);
   else
-    return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); 
+    return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5));
 }
 
 /**
  * \brief set default parameters  The use of this routine is optional.
- * 
- * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
+ *
+ * Colamd: rows with more than (knobs [DenseRow] * n_col)
  * entries are removed prior to ordering.  Columns with more than
- * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
- * ordering, and placed last in the output column ordering. 
+ * (knobs [DenseCol] * n_row) entries are removed prior to
+ * ordering, and placed last in the output column ordering.
  *
- * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+ * DenseRow and DenseCol are defined as 0 and 1,
  * respectively, in colamd.h.  Default values of these two knobs
  * are both 0.5.  Currently, only knobs [0] and knobs [1] are
  * used, but future versions may use more knobs.  If so, they will
@@ -279,37 +299,37 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType
  * not need to change, assuming that you either use
  * colamd_set_defaults, or pass a (double *) NULL pointer as the
  * knobs array to colamd or symamd.
- * 
+ *
  * \param knobs parameter settings for colamd
  */
 
-static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS])
+static inline void set_defaults(double knobs[NKnobs])
 {
   /* === Local variables ================================================== */
-  
+
   int i ;
 
   if (!knobs)
   {
     return ;      /* no knobs to initialize */
   }
-  for (i = 0 ; i < COLAMD_KNOBS ; i++)
+  for (i = 0 ; i < NKnobs ; i++)
   {
     knobs [i] = 0 ;
   }
-  knobs [COLAMD_DENSE_ROW] = 0.5 ;  /* ignore rows over 50% dense */
-  knobs [COLAMD_DENSE_COL] = 0.5 ;  /* ignore columns over 50% dense */
+  knobs [Colamd::DenseRow] = 0.5 ;  /* ignore rows over 50% dense */
+  knobs [Colamd::DenseCol] = 0.5 ;  /* ignore columns over 50% dense */
 }
 
-/** 
+/**
  * \brief  Computes a column ordering using the column approximate minimum degree ordering
- * 
+ *
  * Computes a column ordering (Q) of A such that P(AQ)=LU or
  * (AQ)'AQ=LL' have less fill-in and require fewer floating point
  * operations than factorizing the unpermuted matrix A or A'A,
  * respectively.
- * 
- * 
+ *
+ *
  * \param n_row number of rows in A
  * \param n_col number of columns in A
  * \param Alen, size of the array A
@@ -319,143 +339,143 @@ static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS])
  * \param stats colamd output statistics and error codes
  */
 template <typename IndexType>
-static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS])
+static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[NKnobs], IndexType stats[NStats])
 {
   /* === Local variables ================================================== */
-  
+
   IndexType i ;     /* loop index */
   IndexType nnz ;     /* nonzeros in A */
   IndexType Row_size ;    /* size of Row [], in integers */
   IndexType Col_size ;    /* size of Col [], in integers */
   IndexType need ;      /* minimum required length of A */
-  Colamd_Row<IndexType> *Row ;   /* pointer into A of Row [0..n_row] array */
-  colamd_col<IndexType> *Col ;   /* pointer into A of Col [0..n_col] array */
+  Colamd::RowStructure<IndexType> *Row ;   /* pointer into A of Row [0..n_row] array */
+  Colamd::ColStructure<IndexType> *Col ;   /* pointer into A of Col [0..n_col] array */
   IndexType n_col2 ;    /* number of non-dense, non-empty columns */
   IndexType n_row2 ;    /* number of non-dense, non-empty rows */
   IndexType ngarbage ;    /* number of garbage collections performed */
   IndexType max_deg ;   /* maximum row degree */
-  double default_knobs [COLAMD_KNOBS] ; /* default knobs array */
-  
-  
+  double default_knobs [NKnobs] ; /* default knobs array */
+
+
   /* === Check the input arguments ======================================== */
-  
+
   if (!stats)
   {
     COLAMD_DEBUG0 (("colamd: stats not present\n")) ;
     return (false) ;
   }
-  for (i = 0 ; i < COLAMD_STATS ; i++)
+  for (i = 0 ; i < NStats ; i++)
   {
     stats [i] = 0 ;
   }
-  stats [COLAMD_STATUS] = COLAMD_OK ;
-  stats [COLAMD_INFO1] = -1 ;
-  stats [COLAMD_INFO2] = -1 ;
-  
+  stats [Colamd::Status] = Colamd::Ok ;
+  stats [Colamd::Info1] = -1 ;
+  stats [Colamd::Info2] = -1 ;
+
   if (!A)   /* A is not present */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
+    stats [Colamd::Status] = Colamd::ErrorANotPresent ;
     COLAMD_DEBUG0 (("colamd: A not present\n")) ;
     return (false) ;
   }
-  
+
   if (!p)   /* p is not present */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
+    stats [Colamd::Status] = Colamd::ErrorPNotPresent ;
     COLAMD_DEBUG0 (("colamd: p not present\n")) ;
     return (false) ;
   }
-  
+
   if (n_row < 0)  /* n_row must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
-    stats [COLAMD_INFO1] = n_row ;
+    stats [Colamd::Status] = Colamd::ErrorNrowNegative ;
+    stats [Colamd::Info1] = n_row ;
     COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
     return (false) ;
   }
-  
+
   if (n_col < 0)  /* n_col must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
-    stats [COLAMD_INFO1] = n_col ;
+    stats [Colamd::Status] = Colamd::ErrorNcolNegative ;
+    stats [Colamd::Info1] = n_col ;
     COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
     return (false) ;
   }
-  
+
   nnz = p [n_col] ;
   if (nnz < 0)  /* nnz must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
-    stats [COLAMD_INFO1] = nnz ;
+    stats [Colamd::Status] = Colamd::ErrorNnzNegative ;
+    stats [Colamd::Info1] = nnz ;
     COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
     return (false) ;
   }
-  
+
   if (p [0] != 0)
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
-    stats [COLAMD_INFO1] = p [0] ;
+    stats [Colamd::Status] = Colamd::ErrorP0Nonzero ;
+    stats [Colamd::Info1] = p [0] ;
     COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
     return (false) ;
   }
-  
+
   /* === If no knobs, set default knobs =================================== */
-  
+
   if (!knobs)
   {
-    colamd_set_defaults (default_knobs) ;
+    set_defaults (default_knobs) ;
     knobs = default_knobs ;
   }
-  
+
   /* === Allocate the Row and Col arrays from array A ===================== */
-  
+
   Col_size = colamd_c (n_col) ;
   Row_size = colamd_r (n_row) ;
   need = 2*nnz + n_col + Col_size + Row_size ;
-  
+
   if (need > Alen)
   {
     /* not enough space in array A to perform the ordering */
-    stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
-    stats [COLAMD_INFO1] = need ;
-    stats [COLAMD_INFO2] = Alen ;
+    stats [Colamd::Status] = Colamd::ErrorATooSmall ;
+    stats [Colamd::Info1] = need ;
+    stats [Colamd::Info2] = Alen ;
     COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
     return (false) ;
   }
-  
+
   Alen -= Col_size + Row_size ;
-  Col = (colamd_col<IndexType> *) &A [Alen] ;
-  Row = (Colamd_Row<IndexType> *) &A [Alen + Col_size] ;
+  Col = (ColStructure<IndexType> *) &A [Alen] ;
+  Row = (RowStructure<IndexType> *) &A [Alen + Col_size] ;
 
   /* === Construct the row and column data structures ===================== */
-  
-  if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
+
+  if (!Colamd::init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
   {
     /* input matrix is invalid */
     COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ;
     return (false) ;
   }
-  
+
   /* === Initialize scores, kill dense rows/columns ======================= */
 
-  Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+  Colamd::init_scoring (n_row, n_col, Row, Col, A, p, knobs,
 		&n_row2, &n_col2, &max_deg) ;
-  
+
   /* === Order the supercolumns =========================================== */
-  
-  ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+
+  ngarbage = Colamd::find_ordering (n_row, n_col, Alen, Row, Col, A, p,
 			    n_col2, max_deg, 2*nnz) ;
-  
+
   /* === Order the non-principal columns ================================== */
-  
-  Eigen::internal::order_children (n_col, Col, p) ;
-  
+
+  Colamd::order_children (n_col, Col, p) ;
+
   /* === Return statistics in stats ======================================= */
-  
-  stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
-  stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
-  stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
-  COLAMD_DEBUG0 (("colamd: done.\n")) ; 
+
+  stats [Colamd::DenseRow] = n_row - n_row2 ;
+  stats [Colamd::DenseCol] = n_col - n_col2 ;
+  stats [Colamd::DefragCount] = ngarbage ;
+  COLAMD_DEBUG0 (("colamd: done.\n")) ;
   return (true) ;
 }
 
@@ -465,7 +485,6 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *
 
 /* There are no user-callable routines beyond this point in the file */
 
-
 /* ========================================================================== */
 /* === init_rows_cols ======================================================= */
 /* ========================================================================== */
@@ -485,11 +504,11 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
 
     IndexType n_row,      /* number of rows of A */
     IndexType n_col,      /* number of columns of A */
-    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
-    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    RowStructure<IndexType> Row [],    /* of size n_row+1 */
+    ColStructure<IndexType> Col [],    /* of size n_col+1 */
     IndexType A [],     /* row indices of A, of size Alen */
     IndexType p [],     /* pointers to columns in A, of size n_col+1 */
-    IndexType stats [COLAMD_STATS]  /* colamd statistics */ 
+    IndexType stats [NStats]  /* colamd statistics */
     )
 {
   /* === Local variables ================================================== */
@@ -512,24 +531,24 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
     if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200
     {
       /* column pointers must be non-decreasing */
-      stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
-      stats [COLAMD_INFO1] = col ;
-      stats [COLAMD_INFO2] = Col [col].length ;
+      stats [Colamd::Status] = Colamd::ErrorColLengthNegative ;
+      stats [Colamd::Info1] = col ;
+      stats [Colamd::Info2] = Col [col].length ;
       COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
       return (false) ;
     }
 
     Col [col].shared1.thickness = 1 ;
     Col [col].shared2.score = 0 ;
-    Col [col].shared3.prev = COLAMD_EMPTY ;
-    Col [col].shared4.degree_next = COLAMD_EMPTY ;
+    Col [col].shared3.prev = Empty ;
+    Col [col].shared4.degree_next = Empty ;
   }
 
   /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
 
   /* === Scan columns, compute row degrees, and check row indices ========= */
 
-  stats [COLAMD_INFO3] = 0 ;  /* number of duplicate or unsorted row indices*/
+  stats [Info3] = 0 ;  /* number of duplicate or unsorted row indices*/
 
   for (row = 0 ; row < n_row ; row++)
   {
@@ -551,10 +570,10 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
       /* make sure row indices within range */
       if (row < 0 || row >= n_row)
       {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
-	stats [COLAMD_INFO1] = col ;
-	stats [COLAMD_INFO2] = row ;
-	stats [COLAMD_INFO3] = n_row ;
+	stats [Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds ;
+	stats [Colamd::Info1] = col ;
+	stats [Colamd::Info2] = row ;
+	stats [Colamd::Info3] = n_row ;
 	COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
 	return (false) ;
       }
@@ -563,10 +582,10 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
       {
 	/* row index are unsorted or repeated (or both), thus col */
 	/* is jumbled.  This is a notice, not an error condition. */
-	stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
-	stats [COLAMD_INFO1] = col ;
-	stats [COLAMD_INFO2] = row ;
-	(stats [COLAMD_INFO3]) ++ ;
+	stats [Colamd::Status] = Colamd::OkButJumbled ;
+	stats [Colamd::Info1] = col ;
+	stats [Colamd::Info2] = row ;
+	(stats [Colamd::Info3]) ++ ;
 	COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
       }
 
@@ -604,7 +623,7 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
 
   /* === Create row form ================================================== */
 
-  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+  if (stats [Status] == OkButJumbled)
   {
     /* if cols jumbled, watch for repeated row indices */
     for (col = 0 ; col < n_col ; col++)
@@ -646,7 +665,7 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
 
   /* === See if we need to re-create columns ============================== */
 
-  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
+  if (stats [Status] == OkButJumbled)
   {
     COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;
 
@@ -701,11 +720,11 @@ static void init_scoring
 
     IndexType n_row,      /* number of rows of A */
     IndexType n_col,      /* number of columns of A */
-    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
-    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    RowStructure<IndexType> Row [],    /* of size n_row+1 */
+    ColStructure<IndexType> Col [],    /* of size n_col+1 */
     IndexType A [],     /* column form and row form of A */
     IndexType head [],    /* of size n_col+1 */
-    double knobs [COLAMD_KNOBS],/* parameters */
+    double knobs [NKnobs],/* parameters */
     IndexType *p_n_row2,    /* number of non-dense, non-empty rows */
     IndexType *p_n_col2,    /* number of non-dense, non-empty columns */
     IndexType *p_max_deg    /* maximum row degree */
@@ -732,8 +751,8 @@ static void init_scoring
 
   /* === Extract knobs ==================================================== */
 
-  dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ;
-  dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ;
+  dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseRow] * n_col), n_col)) ;
+  dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseCol] * n_row), n_row)) ;
   COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
   max_deg = 0 ;
   n_col2 = n_col ;
@@ -750,7 +769,7 @@ static void init_scoring
     {
       /* this is a empty column, kill and order it last */
       Col [c].shared2.order = --n_col2 ;
-      KILL_PRINCIPAL_COL (c) ;
+      Col[c].kill_principal() ;
     }
   }
   COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;
@@ -761,7 +780,7 @@ static void init_scoring
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* skip any dead columns */
-    if (COL_IS_DEAD (c))
+    if (Col[c].is_dead())
     {
       continue ;
     }
@@ -777,7 +796,7 @@ static void init_scoring
       {
 	Row [*cp++].shared1.degree-- ;
       }
-      KILL_PRINCIPAL_COL (c) ;
+      Col[c].kill_principal() ;
     }
   }
   COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;
@@ -791,7 +810,7 @@ static void init_scoring
     if (deg > dense_row_count || deg == 0)
     {
       /* kill a dense or empty row */
-      KILL_ROW (r) ;
+      Row[r].kill() ;
       --n_row2 ;
     }
     else
@@ -813,7 +832,7 @@ static void init_scoring
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* skip dead column */
-    if (COL_IS_DEAD (c))
+    if (Col[c].is_dead())
     {
       continue ;
     }
@@ -826,7 +845,7 @@ static void init_scoring
       /* get a row */
       row = *cp++ ;
       /* skip if dead */
-      if (ROW_IS_DEAD (row))
+      if (Row[row].is_dead())
       {
 	continue ;
       }
@@ -845,7 +864,7 @@ static void init_scoring
       /* and have already been killed) */
       COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ;
       Col [c].shared2.order = --n_col2 ;
-      KILL_PRINCIPAL_COL (c) ;
+      Col[c].kill_principal() ;
     }
     else
     {
@@ -870,7 +889,7 @@ static void init_scoring
   /* clear the hash buckets */
   for (c = 0 ; c <= n_col ; c++)
   {
-    head [c] = COLAMD_EMPTY ;
+    head [c] = Empty ;
   }
   min_score = n_col ;
   /* place in reverse order, so low column indices are at the front */
@@ -878,7 +897,7 @@ static void init_scoring
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* only add principal columns to degree lists */
-    if (COL_IS_ALIVE (c))
+    if (Col[c].is_alive())
     {
       COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n",
 		      c, Col [c].shared2.score, min_score, n_col)) ;
@@ -891,16 +910,16 @@ static void init_scoring
       COLAMD_ASSERT (min_score <= n_col) ;
       COLAMD_ASSERT (score >= 0) ;
       COLAMD_ASSERT (score <= n_col) ;
-      COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ;
+      COLAMD_ASSERT (head [score] >= Empty) ;
 
       /* now add this column to dList at proper score location */
       next_col = head [score] ;
-      Col [c].shared3.prev = COLAMD_EMPTY ;
+      Col [c].shared3.prev = Empty ;
       Col [c].shared4.degree_next = next_col ;
 
       /* if there already was a column with the same score, set its */
       /* previous pointer to this new column */
-      if (next_col != COLAMD_EMPTY)
+      if (next_col != Empty)
       {
 	Col [next_col].shared3.prev = c ;
       }
@@ -939,8 +958,8 @@ static IndexType find_ordering /* return the number of garbage collections */
     IndexType n_row,      /* number of rows of A */
     IndexType n_col,      /* number of columns of A */
     IndexType Alen,     /* size of A, 2*nnz + n_col or larger */
-    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
-    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    RowStructure<IndexType> Row [],    /* of size n_row+1 */
+    ColStructure<IndexType> Col [],    /* of size n_col+1 */
     IndexType A [],     /* column form and row form of A */
     IndexType head [],    /* of size n_col+1 */
     IndexType n_col2,     /* Remaining columns to order */
@@ -986,7 +1005,7 @@ static IndexType find_ordering /* return the number of garbage collections */
   /* === Initialization and clear mark ==================================== */
 
   max_mark = INT_MAX - n_col ;  /* INT_MAX defined in <limits.h> */
-  tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+  tag_mark = Colamd::clear_mark (n_row, Row) ;
   min_score = 0 ;
   ngarbage = 0 ;
   COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
@@ -1001,10 +1020,10 @@ static IndexType find_ordering /* return the number of garbage collections */
     /* make sure degree list isn't empty */
     COLAMD_ASSERT (min_score >= 0) ;
     COLAMD_ASSERT (min_score <= n_col) ;
-    COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ;
+    COLAMD_ASSERT (head [min_score] >= Empty) ;
 
     /* get pivot column from head of minimum degree list */
-    while (min_score < n_col && head [min_score] == COLAMD_EMPTY)
+    while (min_score < n_col && head [min_score] == Empty)
     {
       min_score++ ;
     }
@@ -1012,12 +1031,12 @@ static IndexType find_ordering /* return the number of garbage collections */
     COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
     next_col = Col [pivot_col].shared4.degree_next ;
     head [min_score] = next_col ;
-    if (next_col != COLAMD_EMPTY)
+    if (next_col != Empty)
     {
-      Col [next_col].shared3.prev = COLAMD_EMPTY ;
+      Col [next_col].shared3.prev = Empty ;
     }
 
-    COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ;
+    COLAMD_ASSERT (Col[pivot_col].is_alive()) ;
     COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
 
     /* remember score for defrag check */
@@ -1036,12 +1055,12 @@ static IndexType find_ordering /* return the number of garbage collections */
     needed_memory = numext::mini(pivot_col_score, n_col - k) ;
     if (pfree + needed_memory >= Alen)
     {
-      pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+      pfree = Colamd::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
       ngarbage++ ;
       /* after garbage collection we will have enough */
       COLAMD_ASSERT (pfree + needed_memory < Alen) ;
       /* garbage collection has wiped out the Row[].shared2.mark array */
-      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+      tag_mark = Colamd::clear_mark (n_row, Row) ;
 
     }
 
@@ -1064,9 +1083,9 @@ static IndexType find_ordering /* return the number of garbage collections */
     {
       /* get a row */
       row = *cp++ ;
-      COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+      COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", Row[row].is_alive(), row)) ;
       /* skip if row is dead */
-      if (ROW_IS_DEAD (row))
+      if (Row[row].is_dead())
       {
 	continue ;
       }
@@ -1078,7 +1097,7 @@ static IndexType find_ordering /* return the number of garbage collections */
 	col = *rp++ ;
 	/* add the column, if alive and untagged */
 	col_thickness = Col [col].shared1.thickness ;
-	if (col_thickness > 0 && COL_IS_ALIVE (col))
+	if (col_thickness > 0 && Col[col].is_alive())
 	{
 	  /* tag column in pivot row */
 	  Col [col].shared1.thickness = -col_thickness ;
@@ -1105,7 +1124,7 @@ static IndexType find_ordering /* return the number of garbage collections */
       /* may be killing an already dead row */
       row = *cp++ ;
       COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
-      KILL_ROW (row) ;
+      Row[row].kill() ;
     }
 
     /* === Select a row index to use as the new pivot row =============== */
@@ -1120,7 +1139,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     else
     {
       /* there is no pivot row, since it is of zero length */
-      pivot_row = COLAMD_EMPTY ;
+      pivot_row = Empty ;
       COLAMD_ASSERT (pivot_row_length == 0) ;
     }
     COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
@@ -1157,7 +1176,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     while (rp < rp_end)
     {
       col = *rp++ ;
-      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+      COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ;
       COLAMD_DEBUG3 (("Col: %d\n", col)) ;
 
       /* clear tags used to construct pivot row pattern */
@@ -1172,8 +1191,8 @@ static IndexType find_ordering /* return the number of garbage collections */
       next_col = Col [col].shared4.degree_next ;
       COLAMD_ASSERT (cur_score >= 0) ;
       COLAMD_ASSERT (cur_score <= n_col) ;
-      COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ;
-      if (prev_col == COLAMD_EMPTY)
+      COLAMD_ASSERT (cur_score >= Empty) ;
+      if (prev_col == Empty)
       {
 	head [cur_score] = next_col ;
       }
@@ -1181,7 +1200,7 @@ static IndexType find_ordering /* return the number of garbage collections */
       {
 	Col [prev_col].shared4.degree_next = next_col ;
       }
-      if (next_col != COLAMD_EMPTY)
+      if (next_col != Empty)
       {
 	Col [next_col].shared3.prev = prev_col ;
       }
@@ -1194,12 +1213,12 @@ static IndexType find_ordering /* return the number of garbage collections */
       {
 	/* get a row */
 	row = *cp++ ;
-	row_mark = Row [row].shared2.mark ;
 	/* skip if dead */
-	if (ROW_IS_MARKED_DEAD (row_mark))
+	if (Row[row].is_dead())
 	{
 	  continue ;
 	}
+  row_mark = Row [row].shared2.mark ;
 	COLAMD_ASSERT (row != pivot_row) ;
 	set_difference = row_mark - tag_mark ;
 	/* check if the row has been seen yet */
@@ -1215,7 +1234,7 @@ static IndexType find_ordering /* return the number of garbage collections */
 	if (set_difference == 0)
 	{
 	  COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
-	  KILL_ROW (row) ;
+	  Row[row].kill() ;
 	}
 	else
 	{
@@ -1237,7 +1256,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     {
       /* get a column */
       col = *rp++ ;
-      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
+      COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ;
       hash = 0 ;
       cur_score = 0 ;
       cp = &A [Col [col].start] ;
@@ -1252,12 +1271,12 @@ static IndexType find_ordering /* return the number of garbage collections */
 	/* get a row */
 	row = *cp++ ;
 	COLAMD_ASSERT(row >= 0 && row < n_row) ;
-	row_mark = Row [row].shared2.mark ;
 	/* skip if dead */
-	if (ROW_IS_MARKED_DEAD (row_mark))
+	if (Row [row].is_dead())
 	{
 	  continue ;
 	}
+  row_mark = Row [row].shared2.mark ;
 	COLAMD_ASSERT (row_mark > tag_mark) ;
 	/* compact the column */
 	*new_cp++ = row ;
@@ -1278,7 +1297,7 @@ static IndexType find_ordering /* return the number of garbage collections */
       {
 	COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
 	/* nothing left but the pivot row in this column */
-	KILL_PRINCIPAL_COL (col) ;
+	Col[col].kill_principal() ;
 	pivot_row_degree -= Col [col].shared1.thickness ;
 	COLAMD_ASSERT (pivot_row_degree >= 0) ;
 	/* order it */
@@ -1302,7 +1321,7 @@ static IndexType find_ordering /* return the number of garbage collections */
 	COLAMD_ASSERT (hash <= n_col) ;
 
 	head_column = head [hash] ;
-	if (head_column > COLAMD_EMPTY)
+	if (head_column > Empty)
 	{
 	  /* degree list "hash" is non-empty, use prev (shared3) of */
 	  /* first column in degree list as head of hash bucket */
@@ -1319,7 +1338,7 @@ static IndexType find_ordering /* return the number of garbage collections */
 
 	/* save hash function in Col [col].shared3.hash */
 	Col [col].shared3.hash = (IndexType) hash ;
-	COLAMD_ASSERT (COL_IS_ALIVE (col)) ;
+	COLAMD_ASSERT (Col[col].is_alive()) ;
       }
     }
 
@@ -1329,11 +1348,11 @@ static IndexType find_ordering /* return the number of garbage collections */
 
     COLAMD_DEBUG3 (("** Supercolumn detection phase. **\n")) ;
 
-    Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ;
+    Colamd::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ;
 
     /* === Kill the pivotal column ====================================== */
 
-    KILL_PRINCIPAL_COL (pivot_col) ;
+    Col[pivot_col].kill_principal() ;
 
     /* === Clear mark =================================================== */
 
@@ -1341,7 +1360,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     if (tag_mark >= max_mark)
     {
       COLAMD_DEBUG2 (("clearing tag_mark\n")) ;
-      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+      tag_mark = Colamd::clear_mark (n_row, Row) ;
     }
 
     /* === Finalize the new pivot row, and column scores ================ */
@@ -1357,7 +1376,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     {
       col = *rp++ ;
       /* skip dead columns */
-      if (COL_IS_DEAD (col))
+      if (Col[col].is_dead())
       {
 	continue ;
       }
@@ -1391,11 +1410,11 @@ static IndexType find_ordering /* return the number of garbage collections */
       COLAMD_ASSERT (min_score <= n_col) ;
       COLAMD_ASSERT (cur_score >= 0) ;
       COLAMD_ASSERT (cur_score <= n_col) ;
-      COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ;
+      COLAMD_ASSERT (head [cur_score] >= Empty) ;
       next_col = head [cur_score] ;
       Col [col].shared4.degree_next = next_col ;
-      Col [col].shared3.prev = COLAMD_EMPTY ;
-      if (next_col != COLAMD_EMPTY)
+      Col [col].shared3.prev = Empty ;
+      if (next_col != Empty)
       {
 	Col [next_col].shared3.prev = col ;
       }
@@ -1448,7 +1467,7 @@ static inline  void order_children
   /* === Parameters ======================================================= */
 
   IndexType n_col,      /* number of columns of A */
-  colamd_col<IndexType> Col [],    /* of size n_col+1 */
+  ColStructure<IndexType> Col [],    /* of size n_col+1 */
   IndexType p []      /* p [0 ... n_col-1] is the column permutation*/
   )
 {
@@ -1464,15 +1483,15 @@ static inline  void order_children
   for (i = 0 ; i < n_col ; i++)
   {
     /* find an un-ordered non-principal column */
-    COLAMD_ASSERT (COL_IS_DEAD (i)) ;
-    if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY)
+    COLAMD_ASSERT (col_is_dead(Col, i)) ;
+    if (!Col[i].is_dead_principal() && Col [i].shared2.order == Empty)
     {
       parent = i ;
       /* once found, find its principal parent */
       do
       {
 	parent = Col [parent].shared1.parent ;
-      } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+      } while (!Col[parent].is_dead_principal()) ;
 
       /* now, order all un-ordered non-principal columns along path */
       /* to this parent.  collapse tree at the same time */
@@ -1482,7 +1501,7 @@ static inline  void order_children
 
       do
       {
-	COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ;
+	COLAMD_ASSERT (Col [c].shared2.order == Empty) ;
 
 	/* order this column */
 	Col [c].shared2.order = order++ ;
@@ -1493,9 +1512,9 @@ static inline  void order_children
 	c = Col [c].shared1.parent ;
 
 	/* continue until we hit an ordered column.  There are */
-	/* guarranteed not to be anymore unordered columns */
+	/* guaranteed not to be anymore unordered columns */
 	/* above an ordered column */
-      } while (Col [c].shared2.order == COLAMD_EMPTY) ;
+      } while (Col [c].shared2.order == Empty) ;
 
       /* re-order the super_col parent to largest order for this group */
       Col [parent].shared2.order = order ;
@@ -1547,8 +1566,8 @@ template <typename IndexType>
 static void detect_super_cols
 (
   /* === Parameters ======================================================= */
-  
-  colamd_col<IndexType> Col [],    /* of size n_col+1 */
+
+  ColStructure<IndexType> Col [],    /* of size n_col+1 */
   IndexType A [],     /* row indices of A */
   IndexType head [],    /* head of degree lists and hash buckets */
   IndexType row_start,    /* pointer to set of columns to check */
@@ -1578,7 +1597,7 @@ static void detect_super_cols
   while (rp < rp_end)
   {
     col = *rp++ ;
-    if (COL_IS_DEAD (col))
+    if (Col[col].is_dead())
     {
       continue ;
     }
@@ -1590,7 +1609,7 @@ static void detect_super_cols
     /* === Get the first column in this hash bucket ===================== */
 
     head_column = head [hash] ;
-    if (head_column > COLAMD_EMPTY)
+    if (head_column > Empty)
     {
       first_col = Col [head_column].shared3.headhash ;
     }
@@ -1601,10 +1620,10 @@ static void detect_super_cols
 
     /* === Consider each column in the hash bucket ====================== */
 
-    for (super_c = first_col ; super_c != COLAMD_EMPTY ;
+    for (super_c = first_col ; super_c != Empty ;
 	 super_c = Col [super_c].shared4.hash_next)
     {
-      COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ;
+      COLAMD_ASSERT (Col [super_c].is_alive()) ;
       COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ;
       length = Col [super_c].length ;
 
@@ -1614,10 +1633,10 @@ static void detect_super_cols
       /* === Compare super_c with all columns after it ================ */
 
       for (c = Col [super_c].shared4.hash_next ;
-	   c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next)
+	   c != Empty ; c = Col [c].shared4.hash_next)
       {
 	COLAMD_ASSERT (c != super_c) ;
-	COLAMD_ASSERT (COL_IS_ALIVE (c)) ;
+	COLAMD_ASSERT (Col[c].is_alive()) ;
 	COLAMD_ASSERT (Col [c].shared3.hash == hash) ;
 
 	/* not identical if lengths or scores are different */
@@ -1635,10 +1654,10 @@ static void detect_super_cols
 	for (i = 0 ; i < length ; i++)
 	{
 	  /* the columns are "clean" (no dead rows) */
-	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp1))  ;
-	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp2))  ;
+	  COLAMD_ASSERT ( cp1->is_alive() );
+	  COLAMD_ASSERT ( cp2->is_alive() );
 	  /* row indices will same order for both supercols, */
-	  /* no gather scatter nessasary */
+	  /* no gather scatter necessary */
 	  if (*cp1++ != *cp2++)
 	  {
 	    break ;
@@ -1658,9 +1677,9 @@ static void detect_super_cols
 
 	Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
 	Col [c].shared1.parent = super_c ;
-	KILL_NON_PRINCIPAL_COL (c) ;
+	Col[c].kill_non_principal() ;
 	/* order c later, in order_children() */
-	Col [c].shared2.order = COLAMD_EMPTY ;
+	Col [c].shared2.order = Empty ;
 	/* remove c from hash bucket */
 	Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
       }
@@ -1668,15 +1687,15 @@ static void detect_super_cols
 
     /* === Empty this hash bucket ======================================= */
 
-    if (head_column > COLAMD_EMPTY)
+    if (head_column > Empty)
     {
       /* corresponding degree list "hash" is not empty */
-      Col [head_column].shared3.headhash = COLAMD_EMPTY ;
+      Col [head_column].shared3.headhash = Empty ;
     }
     else
     {
       /* corresponding degree list "hash" is empty */
-      head [hash] = COLAMD_EMPTY ;
+      head [hash] = Empty ;
     }
   }
 }
@@ -1688,7 +1707,7 @@ static void detect_super_cols
 
 /*
   Defragments and compacts columns and rows in the workspace A.  Used when
-  all avaliable memory has been used while performing row merging.  Returns
+  all available memory has been used while performing row merging.  Returns
   the index of the first free position in A, after garbage collection.  The
   time taken by this routine is linear is the size of the array A, which is
   itself linear in the number of nonzeros in the input matrix.
@@ -1698,11 +1717,11 @@ template <typename IndexType>
 static IndexType garbage_collection  /* returns the new value of pfree */
   (
     /* === Parameters ======================================================= */
-    
+
     IndexType n_row,      /* number of rows */
     IndexType n_col,      /* number of columns */
-    Colamd_Row<IndexType> Row [],    /* row info */
-    colamd_col<IndexType> Col [],    /* column info */
+    RowStructure<IndexType> Row [],    /* row info */
+    ColStructure<IndexType> Col [],    /* column info */
     IndexType A [],     /* A [0 ... Alen-1] holds the matrix */
     IndexType *pfree      /* &A [0] ... pfree is in use */
     )
@@ -1721,7 +1740,7 @@ static IndexType garbage_collection  /* returns the new value of pfree */
   pdest = &A[0] ;
   for (c = 0 ; c < n_col ; c++)
   {
-    if (COL_IS_ALIVE (c))
+    if (Col[c].is_alive())
     {
       psrc = &A [Col [c].start] ;
 
@@ -1732,7 +1751,7 @@ static IndexType garbage_collection  /* returns the new value of pfree */
       for (j = 0 ; j < length ; j++)
       {
 	r = *psrc++ ;
-	if (ROW_IS_ALIVE (r))
+	if (Row[r].is_alive())
 	{
 	  *pdest++ = r ;
 	}
@@ -1745,22 +1764,22 @@ static IndexType garbage_collection  /* returns the new value of pfree */
 
   for (r = 0 ; r < n_row ; r++)
   {
-    if (ROW_IS_ALIVE (r))
+    if (Row[r].is_alive())
     {
       if (Row [r].length == 0)
       {
-	/* this row is of zero length.  cannot compact it, so kill it */
-	COLAMD_DEBUG3 (("Defrag row kill\n")) ;
-	KILL_ROW (r) ;
+        /* this row is of zero length.  cannot compact it, so kill it */
+        COLAMD_DEBUG3 (("Defrag row kill\n")) ;
+        Row[r].kill() ;
       }
       else
       {
-	/* save first column index in Row [r].shared2.first_column */
-	psrc = &A [Row [r].start] ;
-	Row [r].shared2.first_column = *psrc ;
-	COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
-	/* flag the start of the row with the one's complement of row */
-	*psrc = ONES_COMPLEMENT (r) ;
+        /* save first column index in Row [r].shared2.first_column */
+        psrc = &A [Row [r].start] ;
+        Row [r].shared2.first_column = *psrc ;
+        COLAMD_ASSERT (Row[r].is_alive()) ;
+        /* flag the start of the row with the one's complement of row */
+        *psrc = ones_complement(r) ;
 
       }
     }
@@ -1776,11 +1795,11 @@ static IndexType garbage_collection  /* returns the new value of pfree */
     {
       psrc-- ;
       /* get the row index */
-      r = ONES_COMPLEMENT (*psrc) ;
+      r = ones_complement(*psrc) ;
       COLAMD_ASSERT (r >= 0 && r < n_row) ;
       /* restore first column index */
       *psrc = Row [r].shared2.first_column ;
-      COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
+      COLAMD_ASSERT (Row[r].is_alive()) ;
 
       /* move and compact the row */
       COLAMD_ASSERT (pdest <= psrc) ;
@@ -1789,7 +1808,7 @@ static IndexType garbage_collection  /* returns the new value of pfree */
       for (j = 0 ; j < length ; j++)
       {
 	c = *psrc++ ;
-	if (COL_IS_ALIVE (c))
+	if (Col[c].is_alive())
 	{
 	  *pdest++ = c ;
 	}
@@ -1821,7 +1840,7 @@ static inline  IndexType clear_mark  /* return the new value for tag_mark */
       /* === Parameters ======================================================= */
 
     IndexType n_row,    /* number of rows in A */
-    Colamd_Row<IndexType> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
+    RowStructure<IndexType> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
     )
 {
   /* === Local variables ================================================== */
@@ -1830,7 +1849,7 @@ static inline  IndexType clear_mark  /* return the new value for tag_mark */
 
   for (r = 0 ; r < n_row ; r++)
   {
-    if (ROW_IS_ALIVE (r))
+    if (Row[r].is_alive())
     {
       Row [r].shared2.mark = 0 ;
     }
@@ -1838,6 +1857,7 @@ static inline  IndexType clear_mark  /* return the new value for tag_mark */
   return (1) ;
 }
 
+} // namespace Colamd
 
-} // namespace internal 
+} // namespace internal
 #endif
diff --git a/contrib/eigen/Eigen/src/OrderingMethods/Ordering.h b/contrib/eigen/Eigen/src/OrderingMethods/Ordering.h
index 7ea9b14d7eb285206a4eb98f9b084a3486b65e5e..c578970142114353ea402af96e575496ea97c8a6 100644
--- a/contrib/eigen/Eigen/src/OrderingMethods/Ordering.h
+++ b/contrib/eigen/Eigen/src/OrderingMethods/Ordering.h
@@ -31,15 +31,13 @@ void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat)
   for (int i = 0; i < C.rows(); i++) 
   {
       for (typename MatrixType::InnerIterator it(C, i); it; ++it)
-        it.valueRef() = 0.0;
+        it.valueRef() = typename MatrixType::Scalar(0);
   }
   symmat = C + A;
 }
     
 }
 
-#ifndef EIGEN_MPL2_ONLY
-
 /** \ingroup OrderingMethods_Module
   * \class AMDOrdering
   *
@@ -81,8 +79,6 @@ class AMDOrdering
     }
 };
 
-#endif // EIGEN_MPL2_ONLY
-
 /** \ingroup OrderingMethods_Module
   * \class NaturalOrdering
   *
@@ -133,17 +129,17 @@ class COLAMDOrdering
       StorageIndex n = StorageIndex(mat.cols());
       StorageIndex nnz = StorageIndex(mat.nonZeros());
       // Get the recommended value of Alen to be used by colamd
-      StorageIndex Alen = internal::colamd_recommended(nnz, m, n); 
+      StorageIndex Alen = internal::Colamd::recommended(nnz, m, n); 
       // Set the default parameters
-      double knobs [COLAMD_KNOBS]; 
-      StorageIndex stats [COLAMD_STATS];
-      internal::colamd_set_defaults(knobs);
+      double knobs [internal::Colamd::NKnobs]; 
+      StorageIndex stats [internal::Colamd::NStats];
+      internal::Colamd::set_defaults(knobs);
       
       IndexVector p(n+1), A(Alen); 
       for(StorageIndex i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
       for(StorageIndex i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
       // Call Colamd routine to compute the ordering 
-      StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
+      StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats); 
       EIGEN_UNUSED_VARIABLE(info);
       eigen_assert( info && "COLAMD failed " );
       
diff --git a/contrib/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h b/contrib/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h
index 160d8a5234a8f9da2e3a6154578347abb6b46266..37426877ad4338463f796209fe884b5fb3499704 100644
--- a/contrib/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h
+++ b/contrib/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h
@@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase<Derived>
     
      /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the PaStiX reports a problem
       *          \c InvalidInput if the input matrix is invalid
       *
diff --git a/contrib/eigen/Eigen/src/PardisoSupport/PardisoSupport.h b/contrib/eigen/Eigen/src/PardisoSupport/PardisoSupport.h
index f8c7d0780bce7a1f7dab5efc1628e9e4c31d1054..f89b79bd55bcd039f9cb3293da6cc9ebec56bcd1 100644
--- a/contrib/eigen/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/contrib/eigen/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -123,6 +123,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
     };
 
     PardisoImpl()
+      : m_analysisIsOk(false), m_factorizationIsOk(false)
     {
       eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type");
       m_iparm.setZero();
@@ -140,7 +141,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
   
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix appears to be negative.
       */
     ComputationInfo info() const
@@ -192,8 +193,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
     void pardisoInit(int type)
     {
       m_type = type;
-      EIGEN_USING_STD(abs);
-      bool symmetric = abs(m_type) < 10;
+      bool symmetric = std::abs(m_type) < 10;
       m_iparm[0] = 1;   // No solver default
       m_iparm[1] = 2;   // use Metis for the ordering
       m_iparm[2] = 0;   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
@@ -386,14 +386,15 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
 {
   protected:
     typedef PardisoImpl<PardisoLU> Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLU<MatrixType> >;
 
   public:
 
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
+
     using Base::compute;
     using Base::solve;
 
@@ -441,14 +442,14 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
 {
   protected:
     typedef PardisoImpl< PardisoLLT<MatrixType,_UpLo> > Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLLT<MatrixType,_UpLo> >;
 
   public:
 
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
     typedef typename Base::StorageIndex StorageIndex;
     enum { UpLo = _UpLo };
     using Base::compute;
@@ -504,14 +505,14 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
 {
   protected:
     typedef PardisoImpl< PardisoLDLT<MatrixType,Options> > Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLDLT<MatrixType,Options> >;
 
   public:
 
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
     typedef typename Base::StorageIndex StorageIndex;
     using Base::compute;
     enum { UpLo = Options&(Upper|Lower) };
diff --git a/contrib/eigen/Eigen/src/QR/ColPivHouseholderQR.h b/contrib/eigen/Eigen/src/QR/ColPivHouseholderQR.h
index a7b47d55dc57899b8fc1a3029a64c3654b9c30b0..9b677e9bf45d8ba1ced1d9705d8353c69e857224 100644
--- a/contrib/eigen/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/contrib/eigen/Eigen/src/QR/ColPivHouseholderQR.h
@@ -17,6 +17,9 @@ namespace internal {
 template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
  : traits<_MatrixType>
 {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -46,20 +49,19 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
   * \sa MatrixBase::colPivHouseholderQr()
   */
 template<typename _MatrixType> class ColPivHouseholderQR
+        : public SolverBase<ColPivHouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<ColPivHouseholderQR> Base;
+    friend class SolverBase<ColPivHouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    // FIXME should be int
-    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
     typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
@@ -156,6 +158,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
       computeInPlace();
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -172,11 +175,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
       */
     template<typename Rhs>
     inline const Solve<ColPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return Solve<ColPivHouseholderQR, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     HouseholderSequenceType householderQ() const;
     HouseholderSequenceType matrixQ() const
@@ -402,7 +402,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
       */
     RealScalar maxPivot() const { return m_maxpivot; }
 
-    /** \brief Reports whether the QR factorization was succesful.
+    /** \brief Reports whether the QR factorization was successful.
       *
       * \note This function always returns \c Success. It is provided for compatibility
       * with other factorization routines.
@@ -416,8 +416,10 @@ template<typename _MatrixType> class ColPivHouseholderQR
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -584,8 +586,6 @@ template<typename _MatrixType>
 template<typename RhsType, typename DstType>
 void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
-
   const Index nonzero_pivots = nonzeroPivots();
 
   if(nonzero_pivots == 0)
@@ -596,11 +596,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
 
   typename RhsType::PlainObject c(rhs);
 
-  // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-  c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs)
-                    .setLength(nonzero_pivots)
-                    .transpose()
-    );
+  c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() );
 
   m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
       .template triangularView<Upper>()
@@ -609,6 +605,31 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
   for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i);
   for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
 }
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index nonzero_pivots = nonzeroPivots();
+
+  if(nonzero_pivots == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs);
+
+  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+        .template triangularView<Upper>()
+        .transpose().template conjugateIf<Conjugate>()
+        .solveInPlace(c.topRows(nonzero_pivots));
+
+  dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots);
+  dst.bottomRows(rows()-nonzero_pivots).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf<!Conjugate>() );
+}
 #endif
 
 namespace internal {
diff --git a/contrib/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/contrib/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 34c637b70a9b0aefbf4b5160c54bb3c087199da6..486d3373af5e5068abddbaa11801487175cd3d79 100644
--- a/contrib/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/contrib/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -16,6 +16,9 @@ namespace internal {
 template <typename _MatrixType>
 struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
     : traits<_MatrixType> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -44,19 +47,21 @@ struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
   * 
   * \sa MatrixBase::completeOrthogonalDecomposition()
   */
-template <typename _MatrixType>
-class CompleteOrthogonalDecomposition {
+template <typename _MatrixType> class CompleteOrthogonalDecomposition
+          : public SolverBase<CompleteOrthogonalDecomposition<_MatrixType> >
+{
  public:
   typedef _MatrixType MatrixType;
+  typedef SolverBase<CompleteOrthogonalDecomposition> Base;
+
+  template<typename Derived>
+  friend struct internal::solve_assertion;
+
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition)
   enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
       PermutationType;
@@ -131,9 +136,9 @@ class CompleteOrthogonalDecomposition {
       m_temp(matrix.cols())
   {
     computeInPlace();
-  }
-
+  } 
 
+  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** This method computes the minimum-norm solution X to a least squares
    * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
    * which \c *this is the complete orthogonal decomposition.
@@ -145,11 +150,8 @@ class CompleteOrthogonalDecomposition {
    */
   template <typename Rhs>
   inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve(
-      const MatrixBase<Rhs>& b) const {
-    eigen_assert(m_cpqr.m_isInitialized &&
-                 "CompleteOrthogonalDecomposition is not initialized.");
-    return Solve<CompleteOrthogonalDecomposition, Rhs>(*this, b.derived());
-  }
+      const MatrixBase<Rhs>& b) const;
+  #endif
 
   HouseholderSequenceType householderQ(void) const;
   HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); }
@@ -158,8 +160,8 @@ class CompleteOrthogonalDecomposition {
    */
   MatrixType matrixZ() const {
     MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols());
-    applyZAdjointOnTheLeftInPlace(Z);
-    return Z.adjoint();
+    applyZOnTheLeftInPlace<false>(Z);
+    return Z;
   }
 
   /** \returns a reference to the matrix where the complete orthogonal
@@ -275,6 +277,7 @@ class CompleteOrthogonalDecomposition {
    */
   inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const
   {
+    eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
     return Inverse<CompleteOrthogonalDecomposition>(*this);
   }
 
@@ -353,7 +356,7 @@ class CompleteOrthogonalDecomposition {
   inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); }
 
   /** \brief Reports whether the complete orthogonal decomposition was
-   * succesful.
+   * successful.
    *
    * \note This function always returns \c Success. It is provided for
    * compatibility
@@ -367,7 +370,10 @@ class CompleteOrthogonalDecomposition {
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
-  EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const;
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template<bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
 #endif
 
  protected:
@@ -375,8 +381,22 @@ class CompleteOrthogonalDecomposition {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
 
+  template<bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+      EIGEN_ONLY_USED_FOR_DEBUG(b);
+      eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
+      eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+  }
+
   void computeInPlace();
 
+  /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or
+   *  \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate 
+   *  is set to \c true.
+   */
+  template <bool Conjugate, typename Rhs>
+  void applyZOnTheLeftInPlace(Rhs& rhs) const;
+
   /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
    */
   template <typename Rhs>
@@ -452,7 +472,7 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
         // Apply Z(k) to the first k rows of X_k
         m_cpqr.m_qr.topRightCorner(k, cols - rank + 1)
             .applyHouseholderOnTheRight(
-                m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k),
+                m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k),
                 &m_temp(0));
       }
       if (k != rank - 1) {
@@ -464,6 +484,28 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
   }
 }
 
+template <typename MatrixType>
+template <bool Conjugate, typename Rhs>
+void CompleteOrthogonalDecomposition<MatrixType>::applyZOnTheLeftInPlace(
+    Rhs& rhs) const {
+  const Index cols = this->cols();
+  const Index nrhs = rhs.cols();
+  const Index rank = this->rank();
+  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  for (Index k = rank-1; k >= 0; --k) {
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+    rhs.middleRows(rank - 1, cols - rank + 1)
+        .applyHouseholderOnTheLeft(
+            matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf<!Conjugate>(), zCoeffs().template conjugateIf<Conjugate>()(k),
+            &temp(0));
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+  }
+}
+
 template <typename MatrixType>
 template <typename Rhs>
 void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
@@ -471,7 +513,7 @@ void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
   const Index cols = this->cols();
   const Index nrhs = rhs.cols();
   const Index rank = this->rank();
-  Matrix<typename MatrixType::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
   for (Index k = 0; k < rank; ++k) {
     if (k != rank - 1) {
       rhs.row(k).swap(rhs.row(rank - 1));
@@ -491,8 +533,6 @@ template <typename _MatrixType>
 template <typename RhsType, typename DstType>
 void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
     const RhsType& rhs, DstType& dst) const {
-  eigen_assert(rhs.rows() == this->rows());
-
   const Index rank = this->rank();
   if (rank == 0) {
     dst.setZero();
@@ -500,11 +540,8 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
   }
 
   // Compute c = Q^* * rhs
-  // Note that the matrix Q = H_0^* H_1^*... so its inverse is
-  // Q^* = (H_0 H_1 ...)^T
   typename RhsType::PlainObject c(rhs);
-  c.applyOnTheLeft(
-      householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose());
+  c.applyOnTheLeft(matrixQ().setLength(rank).adjoint());
 
   // Solve T z = c(1:rank, :)
   dst.topRows(rank) = matrixT()
@@ -523,10 +560,45 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
   // Undo permutation to get x = P^{-1} * y.
   dst = colsPermutation() * dst;
 }
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = this->rank();
+
+  if (rank == 0) {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(colsPermutation().transpose()*rhs);
+
+  if (rank < cols()) {
+    applyZOnTheLeftInPlace<!Conjugate>(c);
+  }
+
+  matrixT().topLeftCorner(rank, rank)
+           .template triangularView<Upper>()
+           .transpose().template conjugateIf<Conjugate>()
+           .solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(rows()-rank).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
+}
 #endif
 
 namespace internal {
 
+template<typename MatrixType>
+struct traits<Inverse<CompleteOrthogonalDecomposition<MatrixType> > >
+  : traits<typename Transpose<typename MatrixType::PlainObject>::PlainObject>
+{
+  enum { Flags = 0 };
+};
+
 template<typename DstXprType, typename MatrixType>
 struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType>::Scalar>, Dense2Dense>
 {
@@ -534,7 +606,8 @@ struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType
   typedef Inverse<CodType> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
   {
-    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows()));
+    typedef Matrix<typename CodType::Scalar, CodType::RowsAtCompileTime, CodType::RowsAtCompileTime, 0, CodType::MaxRowsAtCompileTime, CodType::MaxRowsAtCompileTime> IdentityMatrixType;
+    dst = src.nestedExpression().solve(IdentityMatrixType::Identity(src.cols(), src.cols()));
   }
 };
 
diff --git a/contrib/eigen/Eigen/src/QR/FullPivHouseholderQR.h b/contrib/eigen/Eigen/src/QR/FullPivHouseholderQR.h
index e489bddc2dbc69cecec344be252eddf079aa0622..d0664a1d8c76338d6ef56b0afb438817fceff862 100644
--- a/contrib/eigen/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/contrib/eigen/Eigen/src/QR/FullPivHouseholderQR.h
@@ -18,6 +18,9 @@ namespace internal {
 template<typename _MatrixType> struct traits<FullPivHouseholderQR<_MatrixType> >
  : traits<_MatrixType>
 {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -55,20 +58,19 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
   * \sa MatrixBase::fullPivHouseholderQr()
   */
 template<typename _MatrixType> class FullPivHouseholderQR
+        : public SolverBase<FullPivHouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<FullPivHouseholderQR> Base;
+    friend class SolverBase<FullPivHouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    // FIXME should be int
-    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef Matrix<StorageIndex, 1,
@@ -156,6 +158,7 @@ template<typename _MatrixType> class FullPivHouseholderQR
       computeInPlace();
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * \c *this is the QR decomposition.
       *
@@ -173,11 +176,8 @@ template<typename _MatrixType> class FullPivHouseholderQR
       */
     template<typename Rhs>
     inline const Solve<FullPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return Solve<FullPivHouseholderQR, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** \returns Expression object representing the matrix Q
       */
@@ -392,22 +392,24 @@ template<typename _MatrixType> class FullPivHouseholderQR
       *          diagonal coefficient of U.
       */
     RealScalar maxPivot() const { return m_maxpivot; }
-    
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
-    
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
-    
+
     void computeInPlace();
-    
+
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     IntDiagSizeVectorType m_rows_transpositions;
@@ -499,15 +501,15 @@ void FullPivHouseholderQR<MatrixType>::computeInPlace()
       m_nonzero_pivots = k;
       for(Index i = k; i < size; i++)
       {
-        m_rows_transpositions.coeffRef(i) = i;
-        m_cols_transpositions.coeffRef(i) = i;
+        m_rows_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_cols_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
         m_hCoeffs.coeffRef(i) = Scalar(0);
       }
       break;
     }
 
-    m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner;
-    m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner;
+    m_rows_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+    m_cols_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
     if(k != row_of_biggest_in_corner) {
       m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
       ++number_of_transpositions;
@@ -541,7 +543,6 @@ template<typename _MatrixType>
 template<typename RhsType, typename DstType>
 void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
   const Index l_rank = rank();
 
   // FIXME introduce nonzeroPivots() and use it here. and more generally,
@@ -554,7 +555,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
 
   typename RhsType::PlainObject c(rhs);
 
-  Matrix<Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
+  Matrix<typename RhsType::Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
   for (Index k = 0; k < l_rank; ++k)
   {
     Index remainingSize = rows()-k;
@@ -571,6 +572,42 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
   for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i);
   for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
 }
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index l_rank = rank();
+
+  if(l_rank == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs);
+
+  m_qr.topLeftCorner(l_rank, l_rank)
+         .template triangularView<Upper>()
+         .transpose().template conjugateIf<Conjugate>()
+         .solveInPlace(c.topRows(l_rank));
+
+  dst.topRows(l_rank) = c.topRows(l_rank);
+  dst.bottomRows(rows()-l_rank).setZero();
+
+  Matrix<Scalar, 1, DstType::ColsAtCompileTime> temp(dst.cols());
+  const Index size = (std::min)(rows(), cols());
+  for (Index k = size-1; k >= 0; --k)
+  {
+    Index remainingSize = rows()-k;
+
+    dst.bottomRightCorner(remainingSize, dst.cols())
+       .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf<!Conjugate>(),
+                                  m_hCoeffs.template conjugateIf<Conjugate>().coeff(k), &temp.coeffRef(0));
+
+    dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k)));
+  }
+}
 #endif
 
 namespace internal {
diff --git a/contrib/eigen/Eigen/src/QR/HouseholderQR.h b/contrib/eigen/Eigen/src/QR/HouseholderQR.h
index 3513d995cb68b8e421510c026c21e32c8e5d8e91..801739fbd810575c3743939341b34568230a82d8 100644
--- a/contrib/eigen/Eigen/src/QR/HouseholderQR.h
+++ b/contrib/eigen/Eigen/src/QR/HouseholderQR.h
@@ -14,6 +14,18 @@
 
 namespace Eigen { 
 
+namespace internal {
+template<typename _MatrixType> struct traits<HouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
+
 /** \ingroup QR_Module
   *
   *
@@ -42,20 +54,19 @@ namespace Eigen {
   * \sa MatrixBase::householderQr()
   */
 template<typename _MatrixType> class HouseholderQR
+        : public SolverBase<HouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<HouseholderQR> Base;
+    friend class SolverBase<HouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    // FIXME should be int
-    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
@@ -121,6 +132,7 @@ template<typename _MatrixType> class HouseholderQR
       computeInPlace();
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -137,11 +149,8 @@ template<typename _MatrixType> class HouseholderQR
       */
     template<typename Rhs>
     inline const Solve<HouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
-      return Solve<HouseholderQR, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
       *
@@ -204,28 +213,30 @@ template<typename _MatrixType> class HouseholderQR
 
     inline Index rows() const { return m_qr.rows(); }
     inline Index cols() const { return m_qr.cols(); }
-    
+
     /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
       * 
       * For advanced uses only.
       */
     const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
-    
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
-    
+
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
 
     void computeInPlace();
-    
+
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     RowVectorType m_temp;
@@ -292,7 +303,7 @@ template<typename MatrixQR, typename HCoeffs,
   bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
 struct householder_qr_inplace_blocked
 {
-  // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
+  // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
   static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
       typename MatrixQR::Scalar* tempData = 0)
   {
@@ -350,15 +361,10 @@ template<typename RhsType, typename DstType>
 void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
   const Index rank = (std::min)(rows(), cols());
-  eigen_assert(rhs.rows() == rows());
 
   typename RhsType::PlainObject c(rhs);
 
-  // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-  c.applyOnTheLeft(householderSequence(
-    m_qr.leftCols(rank),
-    m_hCoeffs.head(rank)).transpose()
-  );
+  c.applyOnTheLeft(householderQ().setLength(rank).adjoint() );
 
   m_qr.topLeftCorner(rank, rank)
       .template triangularView<Upper>()
@@ -367,6 +373,25 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
   dst.topRows(rank) = c.topRows(rank);
   dst.bottomRows(cols()-rank).setZero();
 }
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = (std::min)(rows(), cols());
+
+  typename RhsType::PlainObject c(rhs);
+
+  m_qr.topLeftCorner(rank, rank)
+      .template triangularView<Upper>()
+      .transpose().template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(rows()-rank).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
+}
 #endif
 
 /** Performs the QR factorization of the given matrix \a matrix. The result of
diff --git a/contrib/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/contrib/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index 953d57c9d76239bad5dc38fc473a7c11b32468ee..013c7ae7a9cc0149ab07654f5c1f59f7f1c3d42a 100644
--- a/contrib/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/contrib/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -74,13 +74,35 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
     };
   public:
     SPQR() 
-      : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
+      : m_analysisIsOk(false),
+        m_factorizationIsOk(false),
+        m_isRUpToDate(false),
+        m_ordering(SPQR_ORDERING_DEFAULT),
+        m_allow_tol(SPQR_DEFAULT_TOL),
+        m_tolerance (NumTraits<Scalar>::epsilon()),
+        m_cR(0),
+        m_E(0),
+        m_H(0),
+        m_HPinv(0),
+        m_HTau(0),
+        m_useDefaultThreshold(true)
     { 
       cholmod_l_start(&m_cc);
     }
     
     explicit SPQR(const _MatrixType& matrix)
-    : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
+      : m_analysisIsOk(false),
+        m_factorizationIsOk(false),
+        m_isRUpToDate(false),
+        m_ordering(SPQR_ORDERING_DEFAULT),
+        m_allow_tol(SPQR_DEFAULT_TOL),
+        m_tolerance (NumTraits<Scalar>::epsilon()),
+        m_cR(0),
+        m_E(0),
+        m_H(0),
+        m_HPinv(0),
+        m_HTau(0),
+        m_useDefaultThreshold(true)
     {
       cholmod_l_start(&m_cc);
       compute(matrix);
@@ -220,7 +242,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the sparse QR can not be computed
       */
     ComputationInfo info() const
diff --git a/contrib/eigen/Eigen/src/SVD/BDCSVD.h b/contrib/eigen/Eigen/src/SVD/BDCSVD.h
index a5b73f8f21f7e1b7785e84246ab420fe006f9c49..17f8e44364ddeb2ddcf41ede841868f10595504c 100644
--- a/contrib/eigen/Eigen/src/SVD/BDCSVD.h
+++ b/contrib/eigen/Eigen/src/SVD/BDCSVD.h
@@ -22,6 +22,11 @@
 // #define EIGEN_BDCSVD_DEBUG_VERBOSE
 // #define EIGEN_BDCSVD_SANITY_CHECKS
 
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+#undef eigen_internal_assert
+#define eigen_internal_assert(X) assert(X);
+#endif
+
 namespace Eigen {
 
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
@@ -34,6 +39,7 @@ namespace internal {
 
 template<typename _MatrixType> 
 struct traits<BDCSVD<_MatrixType> >
+        : traits<_MatrixType>
 {
   typedef _MatrixType MatrixType;
 };  
@@ -57,7 +63,7 @@ struct traits<BDCSVD<_MatrixType> >
  * recommended and can several order of magnitude faster.
  *
  * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations.
- * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless
+ * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless
  * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
  * significantly degrade the accuracy.
  *
@@ -105,7 +111,7 @@ public:
    * The default constructor is useful in cases in which the user intends to
    * perform decompositions via BDCSVD::compute(const MatrixType&).
    */
-  BDCSVD() : m_algoswap(16), m_numIters(0)
+  BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0)
   {}
 
 
@@ -202,6 +208,7 @@ protected:
   using Base::m_computeThinV;
   using Base::m_matrixU;
   using Base::m_matrixV;
+  using Base::m_info;
   using Base::m_isInitialized;
   using Base::m_nonzeroSingularValues;
 
@@ -212,7 +219,7 @@ public:
 
 // Method to allocate and initialize matrix and attributes
 template<typename MatrixType>
-void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
+void BDCSVD<MatrixType>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions)
 {
   m_isTranspose = (cols > rows);
 
@@ -250,16 +257,25 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
   {
     // FIXME this line involves temporaries
     JacobiSVD<MatrixType> jsvd(matrix,computationOptions);
-    if(computeU()) m_matrixU = jsvd.matrixU();
-    if(computeV()) m_matrixV = jsvd.matrixV();
-    m_singularValues = jsvd.singularValues();
-    m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
     m_isInitialized = true;
+    m_info = jsvd.info();
+    if (m_info == Success || m_info == NoConvergence) {
+      if(computeU()) m_matrixU = jsvd.matrixU();
+      if(computeV()) m_matrixV = jsvd.matrixV();
+      m_singularValues = jsvd.singularValues();
+      m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
+    }
     return *this;
   }
   
   //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
-  RealScalar scale = matrix.cwiseAbs().maxCoeff();
+  RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
+    return *this;
+  }
+
   if(scale==Literal(0)) scale = Literal(1);
   MatrixX copy;
   if (m_isTranspose) copy = matrix.adjoint()/scale;
@@ -276,7 +292,11 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
   m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose();
   m_computed.template bottomRows<1>().setZero();
   divide(0, m_diagSize - 1, 0, 0, 0);
-
+  if (m_info != Success && m_info != NoConvergence) {
+    m_isInitialized = true;
+    return *this;
+  }
+    
   //**** step 3 - Copy singular values and vectors
   for (int i=0; i<m_diagSize; i++)
   {
@@ -388,7 +408,7 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
 //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
 // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
 template<typename MatrixType>
-void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift)
+void BDCSVD<MatrixType>::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
 {
   // requires rows = cols + 1;
   using std::pow;
@@ -408,6 +428,8 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
   {
     // FIXME this line involves temporaries
     JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0));
+    m_info = b.info();
+    if (m_info != Success && m_info != NoConvergence) return;
     if (m_compU)
       m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
     else 
@@ -427,7 +449,9 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
   // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the 
   // right submatrix before the left one. 
   divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
+  if (m_info != Success && m_info != NoConvergence) return;
   divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
+  if (m_info != Success && m_info != NoConvergence) return;
 
   if (m_compU)
   {
@@ -568,7 +592,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
 // handling of round-off errors, be consistent in ordering
 // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
 template <typename MatrixType>
-void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
+void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
 {
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   using std::abs;
@@ -591,7 +615,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
   // but others are interleaved and we must ignore them at this stage.
   // To this end, let's compute a permutation skipping them:
   Index actual_n = n;
-  while(actual_n>1 && diag(actual_n-1)==Literal(0)) --actual_n;
+  while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); }
   Index m = 0; // size of the deflated problem
   for(Index k=0;k<actual_n;++k)
     if(abs(col0(k))>considerZero)
@@ -618,13 +642,11 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
   std::cout << "  shift:    " << shifts.transpose() << "\n";
   
   {
-    Index actual_n = n;
-    while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n;
     std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
     std::cout << "    check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
+    assert((((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n) >= 0).all());
     std::cout << "    check2 (>0)      : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
-    std::cout << "    check3 (>0)      : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n";
-    std::cout << "    check4 (>0)      : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n";
+    assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all());
   }
 #endif
   
@@ -652,13 +674,13 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
 #endif
   
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  assert(U.allFinite());
-  assert(V.allFinite());
-  assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n);
-  assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n);
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
+  assert(U.allFinite());
+  assert(V.allFinite());
+//   assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
+//   assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
 #endif
   
   // Because of deflation, the singular values might not be completely sorted.
@@ -673,6 +695,15 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
       if(m_compV) V.col(i).swap(V.col(i+1));
     }
   }
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  {
+    bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all();
+    if(!singular_values_sorted)
+      std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n";
+    assert(singular_values_sorted);
+  }
+#endif
   
   // Reverse order so that singular values in increased order
   // Because of deflation, the zeros singular-values are already at the end
@@ -749,19 +780,22 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
     RealScalar mid = left + (right-left) / Literal(2);
     RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0));
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-    std::cout << right-left << "\n";
-    std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right)   << "\n";
-    std::cout << "     = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.2*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.3*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.4*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.49*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.5*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.51*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.6*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.7*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.8*(left+right), col0, diag, perm, diag, 0)
-              << " "       << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n";
+    std::cout << "right-left = " << right-left << "\n";
+//     std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left)
+//                            << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right)   << "\n";
+    std::cout << "     = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.1)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.2)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.3)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.4)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.49)    *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.5)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.51)    *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.6)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.7)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.8)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.9)     *(right-left), col0, diag, perm, diag, 0)
+              << " "       << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n";
 #endif
     RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right;
     
@@ -819,13 +853,16 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
       // And find mu such that f(mu)==0:
       RealScalar muZero = -a/b;
       RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      assert((numext::isfinite)(fZero));
+#endif
       
       muPrev = muCur;
       fPrev = fCur;
       muCur = muZero;
       fCur = fZero;
       
-      
       if (shift == left  && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
       if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
       if (abs(fCur)>abs(fPrev)) useBisection = true;
@@ -858,20 +895,33 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
         else
           rightShifted = -(std::numeric_limits<RealScalar>::min)();
       }
-      
+
       RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
       eigen_internal_assert(fLeft<Literal(0));
 
-#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
+#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS
       RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
 #endif
 
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      if(!(numext::isfinite)(fLeft))
+        std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n";
+      assert((numext::isfinite)(fLeft));
 
+      if(!(numext::isfinite)(fRight))
+        std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n";
+      // assert((numext::isfinite)(fRight));
+#endif
+    
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
       if(!(fLeft * fRight<0))
       {
-        std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose()  << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n";
-        std::cout << k << " : " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  " << left << " - " << right << " -> " <<  leftShifted << " " << rightShifted << "   shift=" << shift << "\n";
+        std::cout << "f(leftShifted) using  leftShifted=" << leftShifted << " ;  diagShifted(1:10):" << diagShifted.head(10).transpose()  << "\n ; "
+                  << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n";
+        std::cout << "k=" << k << ", " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  "
+                  << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift
+                  << " ,  f(right)=" << secularEq(0,     col0, diag, perm, diagShifted, shift)
+                           << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n";
       }
 #endif
       eigen_internal_assert(fLeft * fRight < Literal(0));
@@ -912,6 +962,15 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
     shifts[k] = shift;
     mus[k] = muCur;
 
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+    if(k+1<n)
+      std::cout << "found " << singVals[k] << " == " << shift << " + " << muCur << " from " << diag(k) << " .. "  << diag(k+1) << "\n";
+#endif
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+    assert(k==0 || singVals[k]>=singVals[k-1]);
+    assert(singVals[k]>=diag(k));
+#endif
+
     // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
     // (deflation is supposed to avoid this from happening)
     // - this does no seem to be necessary anymore -
@@ -935,7 +994,7 @@ void BDCSVD<MatrixType>::perturbCol0
     zhat.setZero();
     return;
   }
-  Index last = perm(m-1);
+  Index lastIdx = perm(m-1);
   // The offset permits to skip deflated entries while computing zhat
   for (Index k = 0; k < n; ++k)
   {
@@ -945,15 +1004,43 @@ void BDCSVD<MatrixType>::perturbCol0
     {
       // see equation (3.6)
       RealScalar dk = diag(k);
-      RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk));
+      RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      if(prod<0) {
+        std::cout << "k = " << k << " ;  z(k)=" << col0(k) << ", diag(k)=" << dk << "\n";
+        std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n";
+        std::cout << "     = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) <<  "\n";
+      }
+      assert(prod>=0);
+#endif
 
       for(Index l = 0; l<m; ++l)
       {
         Index i = perm(l);
         if(i!=k)
         {
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          if(i>=k && (l==0 || l-1>=m))
+          {
+            std::cout << "Error in perturbCol0\n";
+            std::cout << "  " << k << "/" << n << " "  << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " "  <<  "\n";
+            std::cout << "  " <<diag(i) << "\n";
+            Index j = (i<k /*|| l==0*/) ? i : perm(l-1);
+            std::cout << "  " << "j=" << j << "\n";
+          }
+#endif
           Index j = i<k ? i : perm(l-1);
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          if(!(dk!=Literal(0) || diag(i)!=Literal(0)))
+          {
+            std::cout << "k=" << k << ", i=" << i << ", l=" << l << ", perm.size()=" << perm.size() << "\n";
+          }
+          assert(dk!=Literal(0) || diag(i)!=Literal(0));
+#endif
           prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk)));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          assert(prod>=0);
+#endif
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
           if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
             std::cout << "     " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk))
@@ -962,9 +1049,12 @@ void BDCSVD<MatrixType>::perturbCol0
         }
       }
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n";
+      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n";
 #endif
       RealScalar tmp = sqrt(prod);
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      assert((numext::isfinite)(tmp));
+#endif
       zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
     }
   }
@@ -1018,7 +1108,7 @@ void BDCSVD<MatrixType>::computeSingVecs
 // i >= 1, di almost null and zi non null.
 // We use a rotation to zero out zi applied to the left of M
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size)
+void BDCSVD<MatrixType>::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size)
 {
   using std::abs;
   using std::sqrt;
@@ -1047,7 +1137,7 @@ void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index
 // We apply two rotations to have zj = 0;
 // TODO deflation44 is still broken and not properly tested
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size)
+void BDCSVD<MatrixType>::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size)
 {
   using std::abs;
   using std::sqrt;
@@ -1074,7 +1164,7 @@ void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index fi
   }
   c/=r;
   s/=r;
-  m_computed(firstColm + i, firstColm) = r;  
+  m_computed(firstColm + i, firstColm) = r;
   m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
   m_computed(firstColm + j, firstColm) = Literal(0);
 
@@ -1087,7 +1177,7 @@ void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index fi
 
 // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift)
+void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
 {
   using std::sqrt;
   using std::abs;
@@ -1148,6 +1238,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
 #endif
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "to be sorted: " << diag.transpose() << "\n\n";
+  std::cout << "            : " << col0.transpose() << "\n\n";
 #endif
   {
     // Check for total deflation
@@ -1238,7 +1329,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
        if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
       {
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-        std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*diag(i) << "\n";
+        std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*/*diag(i)*/maxDiag << "\n";
 #endif
         eigen_internal_assert(abs(diag(i) - diag(i-1))<epsilon_coarse && " diagonal entries are not properly sorted");
         deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length);
@@ -1257,7 +1348,6 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
 #endif
 }//end deflation
 
-#ifndef __CUDACC__
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
@@ -1270,7 +1360,6 @@ MatrixBase<Derived>::bdcSvd(unsigned int computationOptions) const
 {
   return BDCSVD<PlainObject>(*this, computationOptions);
 }
-#endif
 
 } // end namespace Eigen
 
diff --git a/contrib/eigen/Eigen/src/SVD/JacobiSVD.h b/contrib/eigen/Eigen/src/SVD/JacobiSVD.h
index 43488b1e0c20644eff540b8a5dd82020c5659988..9d95acdf67816edcefeb07e20114168dcef6db54 100644
--- a/contrib/eigen/Eigen/src/SVD/JacobiSVD.h
+++ b/contrib/eigen/Eigen/src/SVD/JacobiSVD.h
@@ -112,12 +112,12 @@ public:
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor))
-              : ColsAtCompileTime==1 ? (MatrixType::Options |   RowMajor)
-              : MatrixType::Options
+    Options = MatrixType::Options
   };
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, TrOptions, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
+
+  typedef typename internal::make_proper_matrix_type<
+    Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime
+  >::type TransposeTypeWithSameStorageOrder;
 
   void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
   {
@@ -202,13 +202,12 @@ public:
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor))
-              : ColsAtCompileTime==1 ? (MatrixType::Options |   RowMajor)
-              : MatrixType::Options
+    Options = MatrixType::Options
   };
 
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, TrOptions, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
+  typedef typename internal::make_proper_matrix_type<
+    Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime
+  >::type TransposeTypeWithSameStorageOrder;
 
   void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
   {
@@ -303,8 +302,9 @@ public:
     Options = MatrixType::Options
   };
 
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
+  typedef typename internal::make_proper_matrix_type<
+    Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime
+  >::type TransposeTypeWithSameStorageOrder;
 
   void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
   {
@@ -425,6 +425,7 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
 
 template<typename _MatrixType, int QRPreconditioner> 
 struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
+        : traits<_MatrixType>
 {
   typedef _MatrixType MatrixType;
 };
@@ -584,6 +585,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
     using Base::m_matrixU;
     using Base::m_matrixV;
     using Base::m_singularValues;
+    using Base::m_info;
     using Base::m_isInitialized;
     using Base::m_isAllocated;
     using Base::m_usePrescribedThreshold;
@@ -610,7 +612,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
 };
 
 template<typename MatrixType, int QRPreconditioner>
-void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions)
+void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions)
 {
   eigen_assert(rows >= 0 && cols >= 0);
 
@@ -624,6 +626,7 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, u
 
   m_rows = rows;
   m_cols = cols;
+  m_info = Success;
   m_isInitialized = false;
   m_isAllocated = true;
   m_computationOptions = computationOptions;
@@ -673,7 +676,12 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
-  RealScalar scale = matrix.cwiseAbs().maxCoeff();
+  RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
+    return *this;
+  }
   if(scale==RealScalar(0)) scale = RealScalar(1);
   
   /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
diff --git a/contrib/eigen/Eigen/src/SVD/SVDBase.h b/contrib/eigen/Eigen/src/SVD/SVDBase.h
index 53da2848883b71bf6a58116e80f5486ed26ea324..bc7ab88b4cffd36f126c224c1ea2d209cc5dd1c5 100644
--- a/contrib/eigen/Eigen/src/SVD/SVDBase.h
+++ b/contrib/eigen/Eigen/src/SVD/SVDBase.h
@@ -17,6 +17,18 @@
 #define EIGEN_SVDBASE_H
 
 namespace Eigen {
+
+namespace internal {
+template<typename Derived> struct traits<SVDBase<Derived> >
+ : traits<Derived>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+}
+
 /** \ingroup SVD_Module
  *
  *
@@ -39,20 +51,26 @@ namespace Eigen {
  * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
  * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
  * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
+ * 
+ * The status of the computation can be retrived using the \a info() method. Unless \a info() returns \a Success, the results should be not
+ * considered well defined.
  *  
- * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
+ * If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will return \a InvalidInput, but the computation is guaranteed to
  * terminate in finite (and reasonable) time.
  * \sa class BDCSVD, class JacobiSVD
  */
-template<typename Derived>
-class SVDBase
+template<typename Derived> class SVDBase
+ : public SolverBase<SVDBase<Derived> >
 {
+public: 
+   
+  template<typename Derived_>
+  friend struct internal::solve_assertion;
 
-public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef typename Eigen::internal::traits<SVDBase>::StorageIndex StorageIndex;
   typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
@@ -82,7 +100,7 @@ public:
    */
   const MatrixUType& matrixU() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
     return m_matrixU;
   }
@@ -98,7 +116,7 @@ public:
    */
   const MatrixVType& matrixV() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     eigen_assert(computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
     return m_matrixV;
   }
@@ -110,14 +128,14 @@ public:
    */
   const SingularValuesType& singularValues() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     return m_singularValues;
   }
 
   /** \returns the number of singular values that are not exactly 0 */
   Index nonzeroSingularValues() const
   {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    _check_compute_assertions();
     return m_nonzeroSingularValues;
   }
   
@@ -130,7 +148,7 @@ public:
   inline Index rank() const
   {
     using std::abs;
-    eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+    _check_compute_assertions();
     if(m_singularValues.size()==0) return 0;
     RealScalar premultiplied_threshold = numext::maxi<RealScalar>(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
     Index i = m_nonzeroSingularValues-1;
@@ -194,6 +212,7 @@ public:
   inline Index rows() const { return m_rows; }
   inline Index cols() const { return m_cols; }
   
+  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
     *
     * \param b the right-hand-side of the equation to solve.
@@ -205,32 +224,55 @@ public:
     */
   template<typename Rhs>
   inline const Solve<Derived, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
+  solve(const MatrixBase<Rhs>& b) const;
+  #endif
+
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful.
+   */
+  EIGEN_DEVICE_FUNC
+  ComputationInfo info() const
   {
     eigen_assert(m_isInitialized && "SVD is not initialized.");
-    eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-    return Solve<Derived, Rhs>(derived(), b.derived());
+    return m_info;
   }
-  
+
   #ifndef EIGEN_PARSED_BY_DOXYGEN
   template<typename RhsType, typename DstType>
-  EIGEN_DEVICE_FUNC
   void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+  template<bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
   #endif
 
 protected:
-  
+
   static void check_template_parameters()
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
-  
+
+  void _check_compute_assertions() const {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+  }
+
+  template<bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+      EIGEN_ONLY_USED_FOR_DEBUG(b);
+      _check_compute_assertions();
+      eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice).");
+      eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b");
+  }
+
   // return true if already allocated
   bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
 
   MatrixUType m_matrixU;
   MatrixVType m_matrixV;
   SingularValuesType m_singularValues;
+  ComputationInfo m_info;
   bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
   bool m_computeFullU, m_computeThinU;
   bool m_computeFullV, m_computeThinV;
@@ -243,9 +285,14 @@ protected:
    * Default constructor of SVDBase
    */
   SVDBase()
-    : m_isInitialized(false),
+    : m_info(Success),
+      m_isInitialized(false),
       m_isAllocated(false),
       m_usePrescribedThreshold(false),
+      m_computeFullU(false),
+      m_computeThinU(false),
+      m_computeFullV(false),
+      m_computeThinV(false),
       m_computationOptions(0),
       m_rows(-1), m_cols(-1), m_diagSize(0)
   {
@@ -260,17 +307,30 @@ template<typename Derived>
 template<typename RhsType, typename DstType>
 void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
-
   // A = U S V^*
   // So A^{-1} = V S^{-1} U^*
 
-  Matrix<Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
   Index l_rank = rank();
   tmp.noalias() =  m_matrixU.leftCols(l_rank).adjoint() * rhs;
   tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
   dst = m_matrixV.leftCols(l_rank) * tmp;
 }
+
+template<typename Derived>
+template<bool Conjugate, typename RhsType, typename DstType>
+void SVDBase<Derived>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  // A = U S V^*
+  // So  A^{-*} = U S^{-1} V^*
+  // And A^{-T} = U_conj S^{-1} V^T
+  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Index l_rank = rank();
+
+  tmp.noalias() =  m_matrixV.leftCols(l_rank).transpose().template conjugateIf<Conjugate>() * rhs;
+  tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
+  dst = m_matrixU.template conjugateIf<!Conjugate>().leftCols(l_rank) * tmp;
+}
 #endif
 
 template<typename MatrixType>
@@ -288,6 +348,7 @@ bool SVDBase<MatrixType>::allocate(Index rows, Index cols, unsigned int computat
 
   m_rows = rows;
   m_cols = cols;
+  m_info = Success;
   m_isInitialized = false;
   m_isAllocated = true;
   m_computationOptions = computationOptions;
diff --git a/contrib/eigen/Eigen/src/SVD/UpperBidiagonalization.h b/contrib/eigen/Eigen/src/SVD/UpperBidiagonalization.h
index 11ac847e1db8cfdba5cdd203cc18ccd5755ed58c..997defc474419f5428529010054cfbed9b93bb96 100644
--- a/contrib/eigen/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/contrib/eigen/Eigen/src/SVD/UpperBidiagonalization.h
@@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat,
        .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]);
     // apply householder transform to remaining part of mat on the left
     mat.bottomRightCorner(remainingRows-1, remainingCols)
-       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData);
+       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData);
   }
 }
 
@@ -202,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
       {
         SubColumnType y_k( Y.col(k).tail(remainingCols) );
         
-        // let's use the begining of column k of Y as a temporary vector
+        // let's use the beginning of column k of Y as a temporary vector
         SubColumnType tmp( Y.col(k).head(k) );
         y_k.noalias()  = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck
         tmp.noalias()  = V_k1.adjoint()  * v_k;
@@ -231,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
       {
         SubColumnType x_k ( X.col(k).tail(remainingRows-1) );
         
-        // let's use the begining of column k of X as a temporary vectors
+        // let's use the beginning of column k of X as a temporary vectors
         // note that tmp0 and tmp1 overlaps
         SubColumnType tmp0 ( X.col(k).head(k) ),
                       tmp1 ( X.col(k).head(k+1) );
diff --git a/contrib/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h b/contrib/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h
index 369e6804afa778535208f59b5d40b2421ab6bcae..9f93e3255dac5e9dc591962351e4d27262fb50e2 100644
--- a/contrib/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/contrib/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -80,11 +80,19 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
 
     /** Default constructor */
     SimplicialCholeskyBase()
-      : m_info(Success), m_shiftOffset(0), m_shiftScale(1)
+      : m_info(Success),
+        m_factorizationIsOk(false),
+        m_analysisIsOk(false),
+        m_shiftOffset(0),
+        m_shiftScale(1)
     {}
 
     explicit SimplicialCholeskyBase(const MatrixType& matrix)
-      : m_info(Success), m_shiftOffset(0), m_shiftScale(1)
+      : m_info(Success),
+        m_factorizationIsOk(false),
+        m_analysisIsOk(false),
+        m_shiftOffset(0),
+        m_shiftScale(1)
     {
       derived().compute(matrix);
     }
@@ -101,7 +109,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -210,7 +218,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
       CholMatrixType tmp(size,size);
       ConstCholMatrixPtr pmat;
       
-      if(m_P.size()==0 && (UpLo&Upper)==Upper)
+      if(m_P.size() == 0 && (int(UpLo) & int(Upper)) == Upper)
       {
         // If there is no ordering, try to directly use the input matrix without any copy
         internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp);
@@ -279,8 +287,8 @@ template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<Simp
   typedef SparseMatrix<Scalar, ColMajor, StorageIndex>        CholMatrixType;
   typedef TriangularView<const CholMatrixType, Eigen::Lower>  MatrixL;
   typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::Upper>   MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
-  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
+  static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
 template<typename _MatrixType,int _UpLo, typename _Ordering> struct traits<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
@@ -293,8 +301,8 @@ template<typename _MatrixType,int _UpLo, typename _Ordering> struct traits<Simpl
   typedef SparseMatrix<Scalar, ColMajor, StorageIndex>            CholMatrixType;
   typedef TriangularView<const CholMatrixType, Eigen::UnitLower>  MatrixL;
   typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
-  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
+  static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
 template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
diff --git a/contrib/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/contrib/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 7b6183d08b1864e68fbbad47c75b3b69b46bf6ad..72e1740c1938f9dc9e744624120d95658181ca38 100644
--- a/contrib/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/contrib/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -2,46 +2,21 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*
-
-NOTE: thes functions vave been adapted from the LDL library:
+NOTE: these functions have been adapted from the LDL library:
 
 LDL Copyright (c) 2005 by Timothy A. Davis.  All Rights Reserved.
 
-LDL License:
-
-    Your use or distribution of LDL or any modified version of
-    LDL implies that you agree to this License.
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
-    USA
-
-    Permission is hereby granted to use or copy this program under the
-    terms of the GNU LGPL, provided that the Copyright, this License,
-    and the Availability of the original version is retained on all copies.
-    User documentation of any code that uses this code or any modified
-    version of this code must cite the Copyright, this License, the
-    Availability note, and "Used by permission." Permission to modify
-    the code and to distribute modified code is granted, provided the
-    Copyright, this License, and the Availability note are retained,
-    and a notice that the code was modified is included.
+The author of LDL, Timothy A. Davis., has executed a license with Google LLC
+to permit distribution of this code and derivative works as part of Eigen under
+the Mozilla Public License v. 2.0, as stated at the top of this file.
  */
 
-#include "../Core/util/NonMPL2.h"
-
 #ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
 #define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
 
@@ -122,7 +97,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
   for(StorageIndex k = 0; k < size; ++k)
   {
     // compute nonzero pattern of kth row of L, in topological order
-    y[k] = 0.0;                     // Y(0:k) is now all zero
+    y[k] = Scalar(0);                     // Y(0:k) is now all zero
     StorageIndex top = size;               // stack for pattern is empty
     tags[k] = k;                    // mark node k as visited
     m_nonZerosPerCol[k] = 0;        // count of nonzeros in column k of L
@@ -146,12 +121,12 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
     /* compute numerical values kth row of L (a sparse triangular solve) */
 
     RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset;    // get D(k,k), apply the shift function, and clear Y(k)
-    y[k] = 0.0;
+    y[k] = Scalar(0);
     for(; top < size; ++top)
     {
       Index i = pattern[top];       /* pattern[top:n-1] is pattern of L(:,k) */
       Scalar yi = y[i];             /* get and clear Y(i) */
-      y[i] = 0.0;
+      y[i] = Scalar(0);
 
       /* the nonzero entry L(k,i) */
       Scalar l_ki;
diff --git a/contrib/eigen/Eigen/src/SparseCore/CompressedStorage.h b/contrib/eigen/Eigen/src/SparseCore/CompressedStorage.h
index d89fa0dae47655ebea39339e75af54fa9717984a..acd986fab5cb51738a760e3f3099382be89d402e 100644
--- a/contrib/eigen/Eigen/src/SparseCore/CompressedStorage.h
+++ b/contrib/eigen/Eigen/src/SparseCore/CompressedStorage.h
@@ -207,6 +207,22 @@ class CompressedStorage
       return m_values[id];
     }
 
+    void moveChunk(Index from, Index to, Index chunkSize)
+    {
+      eigen_internal_assert(to+chunkSize <= m_size);
+      if(to>from && from+chunkSize>to)
+      {
+        // move backward
+        internal::smart_memmove(m_values+from,  m_values+from+chunkSize,  m_values+to);
+        internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+      }
+      else
+      {
+        internal::smart_copy(m_values+from,  m_values+from+chunkSize,  m_values+to);
+        internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+      }
+    }
+
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       Index k = 0;
diff --git a/contrib/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/contrib/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 9db119b67f9ea7037336306d0f67bfa23e4462e2..948650253b5d773252c0118c765868ad0d2fd2df 100644
--- a/contrib/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/contrib/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
 #define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -25,16 +25,16 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
   Index rows = lhs.innerSize();
   Index cols = rhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
-  
+
   ei_declare_aligned_stack_constructed_variable(bool,   mask,     rows, 0);
   ei_declare_aligned_stack_constructed_variable(ResScalar, values,   rows, 0);
   ei_declare_aligned_stack_constructed_variable(Index,  indices,  rows, 0);
-  
+
   std::memset(mask,0,sizeof(bool)*rows);
 
   evaluator<Lhs> lhsEval(lhs);
   evaluator<Rhs> rhsEval(rhs);
-  
+
   // estimate the number of non zero entries
   // given a rhs column containing Y non zeros, we assume that the respective Y columns
   // of the lhs differs in average of one non zeros, thus the number of non zeros for
@@ -141,7 +141,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,C
     typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
     typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrixAux;
     typedef typename sparse_eval<ColMajorMatrixAux,ResultType::RowsAtCompileTime,ResultType::ColsAtCompileTime,ColMajorMatrixAux::Flags>::type ColMajorMatrix;
-    
+
     // If the result is tall and thin (in the extreme case a column vector)
     // then it is faster to sort the coefficients inplace instead of transposing twice.
     // FIXME, the following heuristic is probably not very good.
@@ -155,7 +155,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,C
     else
     {
       ColMajorMatrixAux resCol(lhs.rows(),rhs.cols());
-      // ressort to transpose to sort the entries
+      // resort to transpose to sort the entries
       internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrixAux>(lhs, rhs, resCol, false);
       RowMajorMatrix resRow(resCol);
       res = resRow.markAsRValue();
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseAssign.h b/contrib/eigen/Eigen/src/SparseCore/SparseAssign.h
index 18352a847b254cd4d295f13a5e91dcdc8d0685da..905485c88ed11bc8c5a6c731459b84aa20d1e5c3 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseAssign.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseAssign.h
@@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
     // eval without temporary
     dst.resize(src.rows(), src.cols());
     dst.setZero();
-    dst.reserve((std::max)(src.rows(),src.cols())*2);
+    dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
     for (Index j=0; j<outerEvaluationSize; ++j)
     {
       dst.startVec(j);
@@ -107,7 +107,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
     
     DstXprType temp(src.rows(), src.cols());
 
-    temp.reserve((std::max)(src.rows(),src.cols())*2);
+    temp.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
     for (Index j=0; j<outerEvaluationSize; ++j)
     {
       temp.startVec(j);
@@ -134,8 +134,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse>
 };
 
 // Generic Sparse to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor>
-struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense>
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Weak>
 {
   static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
@@ -153,6 +153,73 @@ struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense>
   }
 };
 
+// Specialization for dense ?= dense +/- sparse and dense ?= sparse +/- dense
+template<typename DstXprType, typename Func1, typename Func2>
+struct assignment_from_dense_op_sparse
+{
+  template<typename SrcXprType, typename InitialFunc>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
+  {
+    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN
+    #endif
+
+    call_assignment_no_alias(dst, src.lhs(), Func1());
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+
+  // Specialization for dense1 = sparse + dense2; -> dense1 = dense2; dense1 += sparse;
+  template<typename Lhs, typename Rhs, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type
+  run(DstXprType &dst, const CwiseBinaryOp<internal::scalar_sum_op<Scalar,Scalar>, const Lhs, const Rhs> &src,
+      const internal::assign_op<typename DstXprType::Scalar,Scalar>& /*func*/)
+  {
+    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN
+    #endif
+
+    // Apply the dense matrix first, then the sparse one.
+    call_assignment_no_alias(dst, src.rhs(), Func1());
+    call_assignment_no_alias(dst, src.lhs(), Func2());
+  }
+
+  // Specialization for dense1 = sparse - dense2; -> dense1 = -dense2; dense1 += sparse;
+  template<typename Lhs, typename Rhs, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type
+  run(DstXprType &dst, const CwiseBinaryOp<internal::scalar_difference_op<Scalar,Scalar>, const Lhs, const Rhs> &src,
+      const internal::assign_op<typename DstXprType::Scalar,Scalar>& /*func*/)
+  {
+    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN
+    #endif
+
+    // Apply the dense matrix first, then the sparse one.
+    call_assignment_no_alias(dst, -src.rhs(), Func1());
+    call_assignment_no_alias(dst,  src.lhs(), add_assign_op<typename DstXprType::Scalar,typename Lhs::Scalar>());
+  }
+};
+
+#define EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(ASSIGN_OP,BINOP,ASSIGN_OP2) \
+  template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> \
+  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<Scalar,Scalar>, const Lhs, const Rhs>, internal::ASSIGN_OP<typename DstXprType::Scalar,Scalar>, \
+                    Sparse2Dense, \
+                    typename internal::enable_if<   internal::is_same<typename internal::evaluator_traits<Lhs>::Shape,DenseShape>::value \
+                                                 || internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type> \
+    : assignment_from_dense_op_sparse<DstXprType, internal::ASSIGN_OP<typename DstXprType::Scalar,typename Lhs::Scalar>, internal::ASSIGN_OP2<typename DstXprType::Scalar,typename Rhs::Scalar> > \
+  {}
+
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op,    scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_sum_op,add_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_sum_op,sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op,    scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_difference_op,sub_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_difference_op,add_assign_op);
+
+
 // Specialization for "dst = dec.solve(rhs)"
 // NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
@@ -179,35 +246,22 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
 {
   typedef typename DstXprType::StorageIndex StorageIndex;
   typedef typename DstXprType::Scalar Scalar;
-  typedef Array<StorageIndex,Dynamic,1> ArrayXI;
-  typedef Array<Scalar,Dynamic,1> ArrayXS;
-  template<int Options>
-  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  {
-    Index dstRows = src.rows();
-    Index dstCols = src.cols();
-    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-      dst.resize(dstRows, dstCols);
 
-    Index size = src.diagonal().size();
-    dst.makeCompressed();
-    dst.resizeNonZeros(size);
-    Map<ArrayXI>(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1);
-    Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size));
-    Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal();
-  }
+  template<int Options, typename AssignFunc>
+  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const AssignFunc &func)
+  { dst.assignDiagonal(src.diagonal(), func); }
   
   template<typename DstDerived>
   static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  {
-    dst.diagonal() = src.diagonal();
-  }
+  { dst.derived().diagonal() = src.diagonal(); }
   
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  { dst.diagonal() += src.diagonal(); }
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.derived().diagonal() += src.diagonal(); }
   
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  { dst.diagonal() -= src.diagonal(); }
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.derived().diagonal() -= src.diagonal(); }
 };
 } // end namespace internal
 
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseBlock.h b/contrib/eigen/Eigen/src/SparseCore/SparseBlock.h
index 511e92b2f9b43470486c7461c7e00d58a9557649..5b4f6cc9f3f81e4857ba64021c4c34a8e0f23af9 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseBlock.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseBlock.h
@@ -164,7 +164,7 @@ public:
       }
       else
       {
-        if(m_matrix.isCompressed())
+        if(m_matrix.isCompressed() && nnz!=block_size)
         {
           // no need to realloc, simply copy the tail at its respective position and insert tmp
           matrix.data().resize(start + nnz + tail_size);
@@ -326,46 +326,6 @@ private:
 
 //----------
 
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major).
-  */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer)
-{ return InnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major). Read-only.
-  */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer) const
-{ return ConstInnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major).
-  */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorsReturnType
-SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
-{
-  return Block<Derived,Dynamic,Dynamic,true>(derived(),
-                                             IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                             IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major). Read-only.
-  */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorsReturnType
-SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
-{
-  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
-                                                  IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                                  IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
 /** Generic implementation of sparse Block expression.
   * Real-only.
   */
@@ -486,9 +446,13 @@ struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBa
     {}
 
     inline Index nonZerosEstimate() const {
-      Index nnz = m_block.nonZeros();
-      if(nnz<0)
-        return m_argImpl.nonZerosEstimate() * m_block.size() / m_block.nestedExpression().size();
+      const Index nnz = m_block.nonZeros();
+      if(nnz < 0) {
+        // Scale the non-zero estimate for the underlying expression linearly with block size.
+        // Return zero if the underlying block is empty.
+        const Index nested_sz = m_block.nestedExpression().size();        
+        return nested_sz == 0 ? 0 : m_argImpl.nonZerosEstimate() * m_block.size() / nested_sz;
+      }
       return nnz;
     }
 
@@ -503,22 +467,25 @@ template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::InnerVectorInnerIterator
  : public EvalIterator
 {
-  enum { IsRowMajor = unary_evaluator::IsRowMajor };
+  // NOTE MSVC fails to compile if we don't explicitely "import" IsRowMajor from unary_evaluator
+  //      because the base class EvalIterator has a private IsRowMajor enum too. (bug #1786)
+  // NOTE We cannot call it IsRowMajor because it would shadow unary_evaluator::IsRowMajor
+  enum { XprIsRowMajor = unary_evaluator::IsRowMajor };
   const XprType& m_block;
   Index m_end;
 public:
 
   EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer)
-    : EvalIterator(aEval.m_argImpl, outer + (IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())),
+    : EvalIterator(aEval.m_argImpl, outer + (XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())),
       m_block(aEval.m_block),
-      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
+      m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
   {
-    while( (EvalIterator::operator bool()) && (EvalIterator::index() < (IsRowMajor ? m_block.startCol() : m_block.startRow())) )
+    while( (EvalIterator::operator bool()) && (EvalIterator::index() < (XprIsRowMajor ? m_block.startCol() : m_block.startRow())) )
       EvalIterator::operator++();
   }
 
-  inline StorageIndex index() const { return EvalIterator::index() - convert_index<StorageIndex>(IsRowMajor ? m_block.startCol() : m_block.startRow()); }
-  inline Index outer()  const { return EvalIterator::outer() - (IsRowMajor ? m_block.startRow() : m_block.startCol()); }
+  inline StorageIndex index() const { return EvalIterator::index() - convert_index<StorageIndex>(XprIsRowMajor ? m_block.startCol() : m_block.startRow()); }
+  inline Index outer()  const { return EvalIterator::outer() - (XprIsRowMajor ? m_block.startRow() : m_block.startCol()); }
   inline Index row()    const { return EvalIterator::row()   - m_block.startRow(); }
   inline Index col()    const { return EvalIterator::col()   - m_block.startCol(); }
 
@@ -528,7 +495,8 @@ public:
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::OuterVectorInnerIterator
 {
-  enum { IsRowMajor = unary_evaluator::IsRowMajor };
+  // NOTE see above
+  enum { XprIsRowMajor = unary_evaluator::IsRowMajor };
   const unary_evaluator& m_eval;
   Index m_outerPos;
   const Index m_innerIndex;
@@ -538,9 +506,9 @@ public:
 
   EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer)
     : m_eval(aEval),
-      m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ),
-      m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
-      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()),
+      m_outerPos( (XprIsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ),
+      m_innerIndex(XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
+      m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()),
       m_it(m_eval.m_argImpl, m_outerPos)
   {
     EIGEN_UNUSED_VARIABLE(outer);
@@ -551,10 +519,10 @@ public:
       ++(*this);
   }
 
-  inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
+  inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (XprIsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
   inline Index outer()  const { return 0; }
-  inline Index row()    const { return IsRowMajor ? 0 : index(); }
-  inline Index col()    const { return IsRowMajor ? index() : 0; }
+  inline Index row()    const { return XprIsRowMajor ? 0 : index(); }
+  inline Index col()    const { return XprIsRowMajor ? index() : 0; }
 
   inline Scalar value() const { return m_it.value(); }
   inline Scalar& valueRef() { return m_it.valueRef(); }
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseCompressedBase.h b/contrib/eigen/Eigen/src/SparseCore/SparseCompressedBase.h
index 5ccb46656144ec7b9c49a172838d51fe2e3c0a33..6a2c7a8ce6354fc70032e57734f12886d9f2d2e0 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -128,6 +128,28 @@ class SparseCompressedBase
   protected:
     /** Default constructor. Do nothing. */
     SparseCompressedBase() {}
+
+    /** \internal return the index of the coeff at (row,col) or just before if it does not exist.
+      * This is an analogue of std::lower_bound.
+      */
+    internal::LowerBoundIndex lower_bound(Index row, Index col) const
+    {
+      eigen_internal_assert(row>=0 && row<this->rows() && col>=0 && col<this->cols());
+
+      const Index outer = Derived::IsRowMajor ? row : col;
+      const Index inner = Derived::IsRowMajor ? col : row;
+
+      Index start = this->outerIndexPtr()[outer];
+      Index end = this->isCompressed() ? this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer];
+      eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
+      internal::LowerBoundIndex p;
+      p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr();
+      p.found = (p.value<end) && (this->innerIndexPtr()[p.value]==inner);
+      return p;
+    }
+
+    friend struct internal::evaluator<SparseCompressedBase<Derived> >;
+
   private:
     template<typename OtherDerived> explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&);
 };
@@ -185,6 +207,14 @@ class SparseCompressedBase<Derived>::InnerIterator
     }
 
     inline InnerIterator& operator++() { m_id++; return *this; }
+    inline InnerIterator& operator+=(Index i) { m_id += i ; return *this; }
+
+    inline InnerIterator operator+(Index i) 
+    { 
+        InnerIterator result = *this;
+        result += i;
+        return result;
+    }
 
     inline const Scalar& value() const { return m_values[m_id]; }
     inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
@@ -245,6 +275,14 @@ class SparseCompressedBase<Derived>::ReverseInnerIterator
     }
 
     inline ReverseInnerIterator& operator--() { --m_id; return *this; }
+    inline ReverseInnerIterator& operator-=(Index i) { m_id -= i; return *this; }
+
+    inline ReverseInnerIterator operator-(Index i) 
+    {
+        ReverseInnerIterator result = *this;
+        result -= i;
+        return result;
+    }
 
     inline const Scalar& value() const { return m_values[m_id-1]; }
     inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
@@ -317,17 +355,8 @@ protected:
 
   Index find(Index row, Index col) const
   {
-    eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
-
-    const Index outer = Derived::IsRowMajor ? row : col;
-    const Index inner = Derived::IsRowMajor ? col : row;
-
-    Index start = m_matrix->outerIndexPtr()[outer];
-    Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
-    eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
-    const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr();
-
-    return ((p<end) && (m_matrix->innerIndexPtr()[p]==inner)) ? p : Dynamic;
+    internal::LowerBoundIndex p = m_matrix->lower_bound(row,col);
+    return p.found ? p.value : Dynamic;
   }
 
   const Derived *m_matrix;
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/contrib/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index e315e35506059e1b7ad1932faaefc33a0dfa4960..9b0d3f98dcd7c9ad9d446d65bc3de39e7b03c90a 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -101,7 +101,7 @@ public:
       }
       else
       {
-        m_value = 0; // this is to avoid a compilation warning
+        m_value = Scalar(0); // this is to avoid a compilation warning
         m_id = -1;
       }
       return *this;
@@ -126,7 +126,7 @@ public:
   
   
   enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
     Flags = XprType::Flags
   };
   
@@ -211,9 +211,8 @@ public:
 
 
   enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    // Expose storage order of the sparse expression
-    Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit)
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
   };
 
   explicit binary_evaluator(const XprType& xpr)
@@ -299,9 +298,8 @@ public:
 
 
   enum {
-    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    // Expose storage order of the sparse expression
-    Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit)
+    CoeffReadCost = int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
   };
 
   explicit binary_evaluator(const XprType& xpr)
@@ -459,7 +457,7 @@ public:
   
   
   enum {
-    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
     Flags = XprType::Flags
   };
   
@@ -532,9 +530,8 @@ public:
   
   
   enum {
-    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    // Expose storage order of the sparse expression
-    Flags = (XprType::Flags & ~RowMajorBit) | (int(RhsArg::Flags)&RowMajorBit)
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
   };
   
   explicit sparse_conjunction_evaluator(const XprType& xpr)
@@ -607,9 +604,8 @@ public:
   
   
   enum {
-    CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    // Expose storage order of the sparse expression
-    Flags = (XprType::Flags & ~RowMajorBit) | (int(LhsArg::Flags)&RowMajorBit)
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
   };
   
   explicit sparse_conjunction_evaluator(const XprType& xpr)
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/contrib/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
index df6c28d2b89be903c1e2b98e5a38d6b8fd6ff06b..32dac0f7863eb8b572c1fd4bf21752755a034aaf 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -24,7 +24,7 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
     class InnerIterator;
     
     enum {
-      CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+      CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
       Flags = XprType::Flags
     };
     
@@ -79,7 +79,7 @@ struct unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>
     class InnerIterator;
     
     enum {
-      CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<ViewOp>::Cost,
+      CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<ViewOp>::Cost),
       Flags = XprType::Flags
     };
     
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseDenseProduct.h b/contrib/eigen/Eigen/src/SparseCore/SparseDenseProduct.h
index 0547db59689da03ac5ea195649f7ac0bbe87455a..f005a18a18e2b35e57fb56349f191edf8825f3da 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -88,10 +88,11 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, A
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
   {
-    evaluator<Lhs> lhsEval(lhs);
+    LhsEval lhsEval(lhs);
     for(Index c=0; c<rhs.cols(); ++c)
     {
       for(Index j=0; j<lhs.outerSize(); ++j)
@@ -111,17 +112,38 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
-    evaluator<Lhs> lhsEval(lhs);
-    for(Index j=0; j<lhs.outerSize(); ++j)
+    Index n = lhs.rows();
+    LhsEval lhsEval(lhs);
+
+#ifdef EIGEN_HAS_OPENMP
+    Eigen::initParallel();
+    Index threads = Eigen::nbThreads();
+    // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+    // It basically represents the minimal amount of work to be done to be worth it.
+    if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
     {
-      typename Res::RowXpr res_j(res.row(j));
-      for(LhsInnerIterator it(lhsEval,j); it ;++it)
-        res_j += (alpha*it.value()) * rhs.row(it.index());
+      #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
+      for(Index i=0; i<n; ++i)
+        processRow(lhsEval,rhs,res,alpha,i);
+    }
+    else
+#endif
+    {
+      for(Index i=0; i<n; ++i)
+        processRow(lhsEval, rhs, res, alpha, i);
     }
   }
+
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha, Index i)
+  {
+    typename Res::RowXpr res_i(res.row(i));
+    for(LhsInnerIterator it(lhsEval,i); it ;++it)
+      res_i += (alpha*it.value()) * rhs.row(it.index());
+  }
 };
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseMatrix.h b/contrib/eigen/Eigen/src/SparseCore/SparseMatrix.h
index a5396538ba9b7d2a49a869178606460040341362..616b4a0c24e7b7b2e725bd4a3e7b62388a00e2a4 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseMatrix.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseMatrix.h
@@ -21,7 +21,7 @@ namespace Eigen {
   * This class implements a more versatile variants of the common \em compressed row/column storage format.
   * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index.
   * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra
-  * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
+  * space in between the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
   * can be done with limited memory reallocation and copies.
   *
   * A call to the function makeCompressed() turns the matrix into the standard \em compressed format
@@ -99,6 +99,8 @@ class SparseMatrix
     typedef SparseCompressedBase<SparseMatrix> Base;
     using Base::convert_index;
     friend class SparseVector<_Scalar,0,_StorageIndex>;
+    template<typename, typename, typename, typename, typename>
+    friend struct internal::Assignment;
   public:
     using Base::isCompressed;
     using Base::nonZeros;
@@ -503,8 +505,8 @@ class SparseMatrix
         m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
       }
     }
-    
-    /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */
+
+    /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       prune(default_prunning_func(reference,epsilon));
@@ -577,10 +579,12 @@ class SparseMatrix
       else if (innerChange < 0) 
       {
         // Inner size decreased: allocate a new m_innerNonZeros
-        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc((m_outerSize+outerChange+1) * sizeof(StorageIndex)));
+        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc((m_outerSize + outerChange) * sizeof(StorageIndex)));
         if (!m_innerNonZeros) internal::throw_std_bad_alloc();
-        for(Index i = 0; i < m_outerSize; i++)
+        for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
           m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
+        for(Index i = m_outerSize; i < m_outerSize + outerChange; i++)
+          m_innerNonZeros[i] = 0;
       }
       
       // Change the m_innerNonZeros in case of a decrease of inner size
@@ -605,9 +609,9 @@ class SparseMatrix
       m_outerIndex = newOuterIndex;
       if (outerChange > 0)
       {
-        StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
+        StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
         for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)          
-          m_outerIndex[i] = last; 
+          m_outerIndex[i] = lastIdx; 
       }
       m_outerSize += outerChange;
     }
@@ -781,6 +785,9 @@ class SparseMatrix
     template<typename OtherDerived>
     inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other)
     { return Base::operator=(other.derived()); }
+
+    template<typename Lhs, typename Rhs>
+    inline SparseMatrix& operator=(const Product<Lhs,Rhs,AliasFreeProduct>& other);
 #endif // EIGEN_PARSED_BY_DOXYGEN
 
     template<typename OtherDerived>
@@ -896,6 +903,113 @@ public:
       m_data.index(p) = convert_index(inner);
       return (m_data.value(p) = Scalar(0));
     }
+protected:
+    struct IndexPosPair {
+      IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {}
+      Index i;
+      Index p;
+    };
+
+    /** \internal assign \a diagXpr to the diagonal of \c *this
+      * There are different strategies:
+      *   1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression.
+      *   2 - otherwise, for each diagonal coeff,
+      *     2.a - if it already exists, then we update it,
+      *     2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion.
+      *     2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector.
+      *   3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements.
+      * 
+      * TODO: some piece of code could be isolated and reused for a general in-place update strategy.
+      * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once),
+      *       then it *might* be better to disable case 2.b since they will have to be copied anyway.
+      */
+    template<typename DiagXpr, typename Func>
+    void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc)
+    {
+      Index n = diagXpr.size();
+
+      const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
+      if(overwrite)
+      {
+        if((this->rows()!=n) || (this->cols()!=n))
+          this->resize(n, n);
+      }
+
+      if(m_data.size()==0 || overwrite)
+      {
+        typedef Array<StorageIndex,Dynamic,1> ArrayXI;  
+        this->makeCompressed();
+        this->resizeNonZeros(n);
+        Eigen::Map<ArrayXI>(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1);
+        Eigen::Map<ArrayXI>(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n));
+        Eigen::Map<Array<Scalar,Dynamic,1> > values = this->coeffs();
+        values.setZero();
+        internal::call_assignment_no_alias(values, diagXpr, assignFunc);
+      }
+      else
+      {
+        bool isComp = isCompressed();
+        internal::evaluator<DiagXpr> diaEval(diagXpr);
+        std::vector<IndexPosPair> newEntries;
+
+        // 1 - try in-place update and record insertion failures
+        for(Index i = 0; i<n; ++i)
+        {
+          internal::LowerBoundIndex lb = this->lower_bound(i,i);
+          Index p = lb.value;
+          if(lb.found)
+          {
+            // the coeff already exists
+            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+          }
+          else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i]))
+          {
+            // non compressed mode with local room for inserting one element
+            m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p);
+            m_innerNonZeros[i]++;
+            m_data.value(p) = Scalar(0);
+            m_data.index(p) = StorageIndex(i);
+            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+          }
+          else
+          {
+            // defer insertion
+            newEntries.push_back(IndexPosPair(i,p));
+          }
+        }
+        // 2 - insert deferred entries
+        Index n_entries = Index(newEntries.size());
+        if(n_entries>0)
+        {
+          Storage newData(m_data.size()+n_entries);
+          Index prev_p = 0;
+          Index prev_i = 0;
+          for(Index k=0; k<n_entries;++k)
+          {
+            Index i = newEntries[k].i;
+            Index p = newEntries[k].p;
+            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+p, newData.valuePtr()+prev_p+k);
+            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+p, newData.indexPtr()+prev_p+k);
+            for(Index j=prev_i;j<i;++j)
+              m_outerIndex[j+1] += k;
+            if(!isComp)
+              m_innerNonZeros[i]++;
+            prev_p = p;
+            prev_i = i;
+            newData.value(p+k) = Scalar(0);
+            newData.index(p+k) = StorageIndex(i);
+            assignFunc.assignCoeff(newData.value(p+k), diaEval.coeff(i));
+          }
+          {
+            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+m_data.size(), newData.valuePtr()+prev_p+n_entries);
+            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+m_data.size(), newData.indexPtr()+prev_p+n_entries);
+            for(Index j=prev_i+1;j<=m_outerSize;++j)
+              m_outerIndex[j] += n_entries;
+          }
+          m_data.swap(newData);
+        }
+      }
+    }
 
 private:
   static void check_template_parameters()
@@ -974,7 +1088,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
   * \code
     typedef Triplet<double> T;
     std::vector<T> tripletList;
-    triplets.reserve(estimation_of_entries);
+    tripletList.reserve(estimation_of_entries);
     for(...)
     {
       // ...
@@ -987,7 +1101,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
   *
   * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define
   * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
-  * be explicitely stored into a std::vector for instance.
+  * be explicitly stored into a std::vector for instance.
   */
 template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename InputIterators>
@@ -1233,7 +1347,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca
     }
     
     m_data.index(p) = convert_index(inner);
-    return (m_data.value(p) = 0);
+    return (m_data.value(p) = Scalar(0));
   }
   
   if(m_data.size() != m_data.allocatedSize())
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseMatrixBase.h b/contrib/eigen/Eigen/src/SparseCore/SparseMatrixBase.h
index c6b548f11adb4e2228611793ce94f222cdf85dfd..229449f02274463f3242ea2283faf3a93ab006cb 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -87,6 +87,11 @@ template<typename Derived> class SparseMatrixBase
           * we are dealing with a column-vector (if there is only one column) or with
           * a row-vector (if there is only one row). */
 
+      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+         * and 2 for matrices.
+         */
+
       Flags = internal::traits<Derived>::Flags,
         /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
           * constructed from this one. See the \ref flags "list of flags".
@@ -350,18 +355,6 @@ template<typename Derived> class SparseMatrixBase
     const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
     const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); }
 
-    // inner-vector
-    typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
-    typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
-    InnerVectorReturnType innerVector(Index outer);
-    const ConstInnerVectorReturnType innerVector(Index outer) const;
-
-    // set of inner-vectors
-    typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
-    typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
-    InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize);
-    const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const;
-
     DenseMatrixType toDense() const
     {
       return DenseMatrixType(derived());
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseProduct.h b/contrib/eigen/Eigen/src/SparseCore/SparseProduct.h
index 4cbf68781a84e8a3c21cb0287dc05621caa108c8..af8a7744ddb59848cd7104b41fbba781f537bc60 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseProduct.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
   * The automatic pruning of the small values can be achieved by calling the pruned() function
   * in which case a totally different product algorithm is employed:
   * \code
-  * C = (A*B).pruned();             // supress numerical zeros (exact)
+  * C = (A*B).pruned();             // suppress numerical zeros (exact)
   * C = (A*B).pruned(ref);
   * C = (A*B).pruned(ref,epsilon);
   * \endcode
@@ -164,6 +164,18 @@ protected:
 
 } // end namespace internal
 
+// sparse matrix = sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template<typename Scalar, int _Options, typename _StorageIndex>
+template<typename Lhs, typename Rhs>
+SparseMatrix<Scalar,_Options,_StorageIndex>& SparseMatrix<Scalar,_Options,_StorageIndex>::operator=(const Product<Lhs,Rhs,AliasFreeProduct>& src)
+{
+  // std::cout << "in Assignment : " << DstOptions << "\n";
+  SparseMatrix dst(src.rows(),src.cols());
+  internal::generic_product_impl<Lhs, Rhs>::evalTo(dst,src.lhs(),src.rhs());
+  this->swap(dst);
+  return *this;
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSEPRODUCT_H
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseRef.h b/contrib/eigen/Eigen/src/SparseCore/SparseRef.h
index d91f38f97ce62e9df9b553a85b8f99b431773ed5..748f87d62639028b2306af14d26a7503d60524a6 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseRef.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseRef.h
@@ -201,7 +201,7 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
 
     ~Ref() {
       if(m_hasCopy) {
-        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
         obj->~TPlainObjectType();
       }
     }
@@ -213,7 +213,7 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
     {
       if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed()))
       {
-        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
         ::new (obj) TPlainObjectType(expr);
         m_hasCopy = true;
         Base::construct(*obj);
@@ -227,14 +227,14 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
     template<typename Expression>
     void construct(const Expression& expr, internal::false_type)
     {
-      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
       ::new (obj) TPlainObjectType(expr);
       m_hasCopy = true;
       Base::construct(*obj);
     }
 
   protected:
-    char m_object_bytes[sizeof(TPlainObjectType)];
+    typename internal::aligned_storage<sizeof(TPlainObjectType), EIGEN_ALIGNOF(TPlainObjectType)>::type m_storage;
     bool m_hasCopy;
 };
 
@@ -319,7 +319,7 @@ class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType
 
     ~Ref() {
       if(m_hasCopy) {
-        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
         obj->~TPlainObjectType();
       }
     }
@@ -335,14 +335,14 @@ class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType
     template<typename Expression>
     void construct(const Expression& expr, internal::false_type)
     {
-      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
       ::new (obj) TPlainObjectType(expr);
       m_hasCopy = true;
       Base::construct(*obj);
     }
 
   protected:
-    char m_object_bytes[sizeof(TPlainObjectType)];
+    typename internal::aligned_storage<sizeof(TPlainObjectType), EIGEN_ALIGNOF(TPlainObjectType)>::type m_storage;
     bool m_hasCopy;
 };
 
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h b/contrib/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 76117a0107e4dba6f826a930cdb31e3ec12b8b58..85b00e10e9586f0c7f14099d7880eba71dc9bc24 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -142,6 +142,9 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
       return *this = src.twistedBy(pnull);
     }
 
+    // Since we override the copy-assignment operator, we need to explicitly re-declare the copy-constructor
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(SparseSelfAdjointView)
+
     template<typename SrcMatrixType,unsigned int SrcMode>
     SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcMode>& src)
     {
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseUtil.h b/contrib/eigen/Eigen/src/SparseCore/SparseUtil.h
index 74df0d49642062041e9f90f65de08dd8c39952b3..ceb9368879f1d8a1d9547160d46c30c297b7bbef 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseUtil.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseUtil.h
@@ -140,6 +140,14 @@ struct SparseSelfAdjointShape { static std::string debugName() { return "SparseS
 template<> struct glue_shapes<SparseShape,SelfAdjointShape> { typedef SparseSelfAdjointShape type;  };
 template<> struct glue_shapes<SparseShape,TriangularShape > { typedef SparseTriangularShape  type;  };
 
+// return type of SparseCompressedBase::lower_bound;
+struct LowerBoundIndex {
+  LowerBoundIndex() : value(-1), found(false) {}
+  LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {}
+  Index value;
+  bool found;
+};
+
 } // end namespace internal
 
 /** \ingroup SparseCore_Module
diff --git a/contrib/eigen/Eigen/src/SparseCore/SparseVector.h b/contrib/eigen/Eigen/src/SparseCore/SparseVector.h
index 19b0fbc9d70ac8eac38ddf558e50ff2e3fda4b1c..05779be685b8f39d70f761e8064c7f0c2665d7b6 100644
--- a/contrib/eigen/Eigen/src/SparseCore/SparseVector.h
+++ b/contrib/eigen/Eigen/src/SparseCore/SparseVector.h
@@ -281,7 +281,7 @@ class SparseVector
     }
 
     /** Swaps the values of \c *this and \a other.
-      * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only.
+      * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only.
       * \sa SparseMatrixBase::swap()
       */
     inline void swap(SparseVector& other)
diff --git a/contrib/eigen/Eigen/src/SparseLU/SparseLU.h b/contrib/eigen/Eigen/src/SparseLU/SparseLU.h
index 87f0efe37ed8655f6a6ff96f193265f0f2715d17..0c8d8939be2e21089f6883341b4339d5f4868536 100644
--- a/contrib/eigen/Eigen/src/SparseLU/SparseLU.h
+++ b/contrib/eigen/Eigen/src/SparseLU/SparseLU.h
@@ -18,6 +18,63 @@ template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename
 template <typename MappedSparseMatrixType> struct SparseLUMatrixLReturnType;
 template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixUReturnType;
 
+template <bool Conjugate,class SparseLUType>
+class SparseLUTransposeView : public SparseSolverBase<SparseLUTransposeView<Conjugate,SparseLUType> >
+{
+protected:
+  typedef SparseSolverBase<SparseLUTransposeView<Conjugate,SparseLUType> > APIBase;
+  using APIBase::m_isInitialized;
+public:
+  typedef typename SparseLUType::Scalar Scalar;
+  typedef typename SparseLUType::StorageIndex StorageIndex;
+  typedef typename SparseLUType::MatrixType MatrixType;
+  typedef typename SparseLUType::OrderingType OrderingType;
+
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  SparseLUTransposeView() : m_sparseLU(NULL) {}
+  SparseLUTransposeView(const SparseLUTransposeView& view) {
+    this->m_sparseLU = view.m_sparseLU;
+  }
+  void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;}
+  void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;}
+  using APIBase::_solve_impl;
+  template<typename Rhs, typename Dest>
+  bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
+  {
+    Dest& X(X_base.derived());
+    eigen_assert(m_sparseLU->info() == Success && "The matrix should be factorized first");
+    EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
+                        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+
+
+    // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
+    for(Index j = 0; j < B.cols(); ++j){
+      X.col(j) = m_sparseLU->colsPermutation() * B.const_cast_derived().col(j);
+    }
+    //Forward substitution with transposed or adjoint of U
+    m_sparseLU->matrixU().template solveTransposedInPlace<Conjugate>(X);
+
+    //Backward substitution with transposed or adjoint of L
+    m_sparseLU->matrixL().template solveTransposedInPlace<Conjugate>(X);
+
+    // Permute back the solution
+    for (Index j = 0; j < B.cols(); ++j)
+      X.col(j) = m_sparseLU->rowsPermutation().transpose() * X.col(j);
+    return true;
+  }
+  inline Index rows() const { return m_sparseLU->rows(); }
+  inline Index cols() const { return m_sparseLU->cols(); }
+
+private:
+  SparseLUType *m_sparseLU;
+  SparseLUTransposeView& operator=(const SparseLUTransposeView&);
+};
+
+
 /** \ingroup SparseLU_Module
   * \class SparseLU
   * 
@@ -26,7 +83,7 @@ template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixURetu
   * This class implements the supernodal LU factorization for general matrices.
   * It uses the main techniques from the sequential SuperLU package 
   * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real 
-  * and complex arithmetics with single and double precision, depending on the 
+  * and complex arithmetic with single and double precision, depending on the 
   * scalar type of your input matrix. 
   * The code has been optimized to provide BLAS-3 operations during supernode-panel updates. 
   * It benefits directly from the built-in high-performant Eigen BLAS routines. 
@@ -97,6 +154,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
     };
     
   public:
+
     SparseLU():m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
     {
       initperfvalues(); 
@@ -128,6 +186,45 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
       //Factorize
       factorize(matrix);
     } 
+
+    /** \returns an expression of the transposed of the factored matrix.
+      *
+      * A typical usage is to solve for the transposed problem A^T x = b:
+      * \code
+      * solver.compute(A);
+      * x = solver.transpose().solve(b);
+      * \endcode
+      *
+      * \sa adjoint(), solve()
+      */
+    const SparseLUTransposeView<false,SparseLU<_MatrixType,_OrderingType> > transpose()
+    {
+      SparseLUTransposeView<false,  SparseLU<_MatrixType,_OrderingType> > transposeView;
+      transposeView.setSparseLU(this);
+      transposeView.setIsInitialized(this->m_isInitialized);
+      return transposeView;
+    }
+
+
+    /** \returns an expression of the adjoint of the factored matrix
+      *
+      * A typical usage is to solve for the adjoint problem A' x = b:
+      * \code
+      * solver.compute(A);
+      * x = solver.adjoint().solve(b);
+      * \endcode
+      *
+      * For real scalar types, this function is equivalent to transpose().
+      *
+      * \sa transpose(), solve()
+      */
+    const SparseLUTransposeView<true, SparseLU<_MatrixType,_OrderingType> > adjoint()
+    {
+      SparseLUTransposeView<true,  SparseLU<_MatrixType,_OrderingType> > adjointView;
+      adjointView.setSparseLU(this);
+      adjointView.setIsInitialized(this->m_isInitialized);
+      return adjointView;
+    }
     
     inline Index rows() const { return m_mat.rows(); }
     inline Index cols() const { return m_mat.cols(); }
@@ -193,7 +290,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
       *          \c InvalidInput if the input matrix is invalid
       *
@@ -355,6 +452,9 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
       return (m_detPermR * m_detPermC) > 0 ? det : -det;
     }
 
+    Index nnzL() const { return m_nnzL; };
+    Index nnzU() const { return m_nnzU; };
+
   protected:
     // Functions 
     void initperfvalues()
@@ -391,7 +491,6 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
   private:
     // Disable copy constructor 
     SparseLU (const SparseLU& );
-  
 }; // End class SparseLU
 
 
@@ -501,7 +600,6 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   
   m_isInitialized = true;
   
-  
   // Apply the column permutation computed in analyzepattern()
   //   m_mat = matrix * m_perm_c.inverse(); 
   m_mat = matrix;
@@ -585,7 +683,6 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   //  (a) a relaxed supernode at the bottom of the etree, or
   //  (b) panel_size contiguous columns, <panel_size> defined by the user
   Index jcol; 
-  IndexVector panel_histo(n);
   Index pivrow; // Pivotal row number in the original row matrix
   Index nseg1; // Number of segments in U-column above panel row jcol
   Index nseg; // Number of segments in each U-column 
@@ -704,13 +801,19 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator
   typedef typename MappedSupernodalType::Scalar Scalar;
   explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
   { }
-  Index rows() { return m_mapL.rows(); }
-  Index cols() { return m_mapL.cols(); }
+  Index rows() const { return m_mapL.rows(); }
+  Index cols() const { return m_mapL.cols(); }
   template<typename Dest>
   void solveInPlace( MatrixBase<Dest> &X) const
   {
     m_mapL.solveInPlace(X);
   }
+  template<bool Conjugate, typename Dest>
+  void solveTransposedInPlace( MatrixBase<Dest> &X) const
+  {
+    m_mapL.template solveTransposedInPlace<Conjugate>(X);
+  }
+
   const MappedSupernodalType& m_mapL;
 };
 
@@ -721,8 +824,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
   SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
   : m_mapL(mapL),m_mapU(mapU)
   { }
-  Index rows() { return m_mapL.rows(); }
-  Index cols() { return m_mapL.cols(); }
+  Index rows() const { return m_mapL.rows(); }
+  Index cols() const { return m_mapL.cols(); }
 
   template<typename Dest>   void solveInPlace(MatrixBase<Dest> &X) const
   {
@@ -745,8 +848,9 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
       }
       else
       {
+        // FIXME: the following lines should use Block expressions and not Map!
         Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
         U = A.template triangularView<Upper>().solve(U);
       }
 
@@ -764,6 +868,52 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
       }
     } // End For U-solve
   }
+
+  template<bool Conjugate, typename Dest>   void solveTransposedInPlace(MatrixBase<Dest> &X) const
+  {
+    using numext::conj;
+    Index nrhs = X.cols();
+    Index n    = X.rows();
+    // Forward solve with U
+    for (Index k = 0; k <=  m_mapL.nsuper(); k++)
+    {
+      Index fsupc = m_mapL.supToCol()[k];
+      Index lda = m_mapL.colIndexPtr()[fsupc+1] - m_mapL.colIndexPtr()[fsupc]; // leading dimension
+      Index nsupc = m_mapL.supToCol()[k+1] - fsupc;
+      Index luptr = m_mapL.colIndexPtr()[fsupc];
+
+      for (Index j = 0; j < nrhs; ++j)
+      {
+        for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++)
+        {
+          typename MatrixUType::InnerIterator it(m_mapU, jcol);
+          for ( ; it; ++it)
+          {
+            Index irow = it.index();
+            X(jcol, j) -= X(irow, j) * (Conjugate? conj(it.value()): it.value());
+          }
+        }
+      }
+      if (nsupc == 1)
+      {
+        for (Index j = 0; j < nrhs; j++)
+        {
+          X(fsupc, j) /= (Conjugate? conj(m_mapL.valuePtr()[luptr]) : m_mapL.valuePtr()[luptr]);
+        }
+      }
+      else
+      {
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        if(Conjugate)
+          U = A.adjoint().template triangularView<Lower>().solve(U);
+        else
+          U = A.transpose().template triangularView<Lower>().solve(U);
+      }
+    }// End For U-solve
+  }
+
+
   const MatrixLType& m_mapL;
   const MatrixUType& m_mapU;
 };
diff --git a/contrib/eigen/Eigen/src/SparseLU/SparseLU_Memory.h b/contrib/eigen/Eigen/src/SparseLU/SparseLU_Memory.h
index 4dc42e87ba8eb14c619e928360044073b028405a..349bfd585bdcdbc7f192e78d966dbeae748a744e 100644
--- a/contrib/eigen/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/contrib/eigen/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w)
 
 
 /** 
-  * Expand the existing storage to accomodate more fill-ins
+  * Expand the existing storage to accommodate more fill-ins
   * \param vec Valid pointer to the vector to allocate or expand
   * \param[in,out] length  At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector
   * \param[in] nbElts Current number of elements in the factors
diff --git a/contrib/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/contrib/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index 721e1883ba8fcccc73dd958b4f054ac31fec923a..0be293d17fa51110a211fb99b22ad3e8c67c4560 100644
--- a/contrib/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/contrib/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -75,12 +75,12 @@ class MappedSuperNodalMatrix
     /**
      * Number of rows
      */
-    Index rows() { return m_row; }
+    Index rows() const { return m_row; }
     
     /**
      * Number of columns
      */
-    Index cols() { return m_col; }
+    Index cols() const { return m_col; }
     
     /**
      * Return the array of nonzero values packed by column
@@ -156,6 +156,9 @@ class MappedSuperNodalMatrix
     class InnerIterator; 
     template<typename Dest>
     void solveInPlace( MatrixBase<Dest>&X) const;
+    template<bool Conjugate, typename Dest>
+    void solveTransposedInPlace( MatrixBase<Dest>&X) const;
+
     
       
       
@@ -294,6 +297,77 @@ void MappedSuperNodalMatrix<Scalar,Index_>::solveInPlace( MatrixBase<Dest>&X) co
     } 
 }
 
+template<typename Scalar, typename Index_>
+template<bool Conjugate, typename Dest>
+void MappedSuperNodalMatrix<Scalar,Index_>::solveTransposedInPlace( MatrixBase<Dest>&X) const
+{
+    using numext::conj;
+  Index n    = int(X.rows());
+  Index nrhs = Index(X.cols());
+  const Scalar * Lval = valuePtr();                 // Nonzero values
+  Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor> work(n, nrhs);     // working vector
+  work.setZero();
+  for (Index k = nsuper(); k >= 0; k--)
+  {
+    Index fsupc = supToCol()[k];                    // First column of the current supernode
+    Index istart = rowIndexPtr()[fsupc];            // Pointer index to the subscript of the current column
+    Index nsupr = rowIndexPtr()[fsupc+1] - istart;  // Number of rows in the current supernode
+    Index nsupc = supToCol()[k+1] - fsupc;          // Number of columns in the current supernode
+    Index nrow = nsupr - nsupc;                     // Number of rows in the non-diagonal part of the supernode
+    Index irow;                                     //Current index row
+
+    if (nsupc == 1 )
+    {
+      for (Index j = 0; j < nrhs; j++)
+      {
+        InnerIterator it(*this, fsupc);
+        ++it; // Skip the diagonal element
+        for (; it; ++it)
+        {
+          irow = it.row();
+          X(fsupc,j) -= X(irow, j) * (Conjugate?conj(it.value()):it.value());
+        }
+      }
+    }
+    else
+    {
+      // The supernode has more than one column
+      Index luptr = colIndexPtr()[fsupc];
+      Index lda = colIndexPtr()[fsupc+1] - luptr;
+
+      //Begin Gather
+      for (Index j = 0; j < nrhs; j++)
+      {
+        Index iptr = istart + nsupc;
+        for (Index i = 0; i < nrow; i++)
+        {
+          irow = rowIndex()[iptr];
+          work.topRows(nrow)(i,j)= X(irow,j); // Gather operation
+          iptr++;
+        }
+      }
+
+      // Matrix-vector product with transposed submatrix
+      Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
+      Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+      if(Conjugate)
+        U = U - A.adjoint() * work.topRows(nrow);
+      else
+        U = U - A.transpose() * work.topRows(nrow);
+
+      // Triangular solve (of transposed diagonal block)
+      new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+      if(Conjugate)
+        U = A.adjoint().template triangularView<UnitUpper>().solve(U);
+      else
+        U = A.transpose().template triangularView<UnitUpper>().solve(U);
+
+    }
+
+  }
+}
+
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h b/contrib/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h
index c98b30e323f1213cb59aeadd0d804ed945f89f37..5a2c941b4ab8123739e6a09c3dc523c0a475e36d 100644
--- a/contrib/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/contrib/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -151,7 +151,7 @@ Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index j
         StorageIndex ito = glu.xlsub(fsupc+1);
         glu.xlsub(jcolm1) = ito; 
         StorageIndex istop = ito + jptr - jm1ptr; 
-        xprune(jcolm1) = istop; // intialize xprune(jcol-1)
+        xprune(jcolm1) = istop; // initialize xprune(jcol-1)
         glu.xlsub(jcol) = istop; 
         
         for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
@@ -166,7 +166,7 @@ Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index j
   // Tidy up the pointers before exit
   glu.xsup(nsuper+1) = jcolp1; 
   glu.supno(jcolp1) = nsuper; 
-  xprune(jcol) = StorageIndex(nextl);  // Intialize upper bound for pruning
+  xprune(jcol) = StorageIndex(nextl);  // Initialize upper bound for pruning
   glu.xlsub(jcolp1) = StorageIndex(nextl); 
   
   return 0; 
diff --git a/contrib/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/contrib/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index 95ba7413f2982b4b9ec19dec75a593d7b73be839..e37c2fe0d028482549db8186a6650745ce0f6db7 100644
--- a/contrib/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/contrib/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
         if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize);  }\
                    pstore(C0+i+(I)*PacketSize, c0);
         
-        // agressive vectorization and peeling
+        // aggressive vectorization and peeling
         for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
         {
           EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
diff --git a/contrib/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/contrib/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index 822cf32c3474eed5f026683572672b5f418565dd..f052001c8f8880c17909959687bd2be4a97ae02a 100644
--- a/contrib/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/contrib/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -38,7 +38,7 @@ namespace internal {
  * \brief Performs numeric block updates (sup-panel) in topological order.
  * 
  * Before entering this routine, the original nonzeros in the panel
- * were already copied i nto the spa[m,w]
+ * were already copied into the spa[m,w]
  * 
  * \param m number of rows in the matrix
  * \param w Panel size
diff --git a/contrib/eigen/Eigen/src/SparseQR/SparseQR.h b/contrib/eigen/Eigen/src/SparseQR/SparseQR.h
index 7409fcae9402fed1325ddfc793f6ca0a97b5d29f..d1fb96f5cbac0d14fa0eedab79c384ca1cbe7094 100644
--- a/contrib/eigen/Eigen/src/SparseQR/SparseQR.h
+++ b/contrib/eigen/Eigen/src/SparseQR/SparseQR.h
@@ -41,15 +41,16 @@ namespace internal {
 /**
   * \ingroup SparseQR_Module
   * \class SparseQR
-  * \brief Sparse left-looking rank-revealing QR factorization
+  * \brief Sparse left-looking QR factorization with numerical column pivoting
   * 
-  * This class implements a left-looking rank-revealing QR decomposition 
-  * of sparse matrices. When a column has a norm less than a given tolerance
+  * This class implements a left-looking QR decomposition of sparse matrices
+  * with numerical column pivoting.
+  * When a column has a norm less than a given tolerance
   * it is implicitly permuted to the end. The QR factorization thus obtained is 
   * given by A*P = Q*R where R is upper triangular or trapezoidal. 
   * 
   * P is the column permutation which is the product of the fill-reducing and the
-  * rank-revealing permutations. Use colsPermutation() to get it.
+  * numerical permutations. Use colsPermutation() to get it.
   * 
   * Q is the orthogonal matrix represented as products of Householder reflectors. 
   * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint.
@@ -64,6 +65,17 @@ namespace internal {
   * 
   * \implsparsesolverconcept
   *
+  * The numerical pivoting strategy and default threshold are the same as in SuiteSparse QR, and
+  * detailed in the following paper:
+  * <i>
+  * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
+  * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011.
+  * </i>
+  * Even though it is qualified as "rank-revealing", this strategy might fail for some 
+  * rank deficient problems. When this class is used to solve linear or least-square problems
+  * it is thus strongly recommended to check the accuracy of the computed solution. If it
+  * failed, it usually helps to increase the threshold with setPivotThreshold.
+  * 
   * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
   * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix.
   * 
@@ -331,7 +343,7 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
   m_R.resize(m, n);
   m_Q.resize(m, diagSize);
   
-  // Allocate space for nonzero elements : rough estimation
+  // Allocate space for nonzero elements: rough estimation
   m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree
   m_Q.reserve(2*mat.nonZeros());
   m_hcoeffs.resize(diagSize);
@@ -640,7 +652,8 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
       // Compute res = Q * other column by column
       for(Index j = 0; j < res.cols(); j++)
       {
-        for (Index k = diagSize-1; k >=0; k--)
+        Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1;
+        for (Index k = start_k; k >=0; k--)
         {
           Scalar tau = Scalar(0);
           tau = m_qr.m_Q.col(k).dot(res.col(j));
diff --git a/contrib/eigen/Eigen/src/StlSupport/StdDeque.h b/contrib/eigen/Eigen/src/StlSupport/StdDeque.h
index af158f425d41c426e87c1fbff51a55add617f58e..6d47e7572206c781864286c4836312ad920c1e53 100644
--- a/contrib/eigen/Eigen/src/StlSupport/StdDeque.h
+++ b/contrib/eigen/Eigen/src/StlSupport/StdDeque.h
@@ -36,7 +36,7 @@ namespace std \
     deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
     deque(const deque& c) : deque_base(c) {}  \
     explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
-    deque(iterator start, iterator end) : deque_base(start, end) {}  \
+    deque(iterator start_, iterator end_) : deque_base(start_, end_) {}  \
     deque& operator=(const deque& x) {  \
       deque_base::operator=(x);  \
       return *this;  \
@@ -62,7 +62,7 @@ namespace std {
     : deque_base(first, last, a) {} \
     deque(const deque& c) : deque_base(c) {}  \
     explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
-    deque(iterator start, iterator end) : deque_base(start, end) {}  \
+    deque(iterator start_, iterator end_) : deque_base(start_, end_) {}  \
     deque& operator=(const deque& x) {  \
       deque_base::operator=(x);  \
       return *this;  \
@@ -98,19 +98,7 @@ namespace std {
   { return deque_base::insert(position,x); }
   void insert(const_iterator position, size_type new_size, const value_type& x)
   { deque_base::insert(position, new_size, x); }
-#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1)
-  // workaround GCC std::deque implementation
-  // GCC 10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to
-  // fall-back to the default case
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (new_size < deque_base::size())
-      deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
-    else
-      deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
-  }
 #else
-  // either non-GCC or GCC between 4.1 and 10.1
   // default implementation which should always work.
   void resize(size_type new_size, const value_type& x)
   {
diff --git a/contrib/eigen/Eigen/src/StlSupport/StdList.h b/contrib/eigen/Eigen/src/StlSupport/StdList.h
index e1eba4985967f2c2774451497cf9220ef0278f97..8ba3fada0a50534e4dd6ea14818c997faa8f32ed 100644
--- a/contrib/eigen/Eigen/src/StlSupport/StdList.h
+++ b/contrib/eigen/Eigen/src/StlSupport/StdList.h
@@ -35,7 +35,7 @@ namespace std \
     list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
     list(const list& c) : list_base(c) {}  \
     explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
-    list(iterator start, iterator end) : list_base(start, end) {}  \
+    list(iterator start_, iterator end_) : list_base(start_, end_) {}  \
     list& operator=(const list& x) {  \
       list_base::operator=(x);  \
       return *this;  \
@@ -62,7 +62,7 @@ namespace std
     : list_base(first, last, a) {} \
     list(const list& c) : list_base(c) {}  \
     explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
-    list(iterator start, iterator end) : list_base(start, end) {}  \
+    list(iterator start_, iterator end_) : list_base(start_, end_) {}  \
     list& operator=(const list& x) {  \
     list_base::operator=(x);  \
     return *this; \
diff --git a/contrib/eigen/Eigen/src/StlSupport/StdVector.h b/contrib/eigen/Eigen/src/StlSupport/StdVector.h
index ec22821d2612c31bd84a08e9df0565d58f37eb30..9fcf19bce8584bd5508c0bd96836c359b03f0e75 100644
--- a/contrib/eigen/Eigen/src/StlSupport/StdVector.h
+++ b/contrib/eigen/Eigen/src/StlSupport/StdVector.h
@@ -36,7 +36,7 @@ namespace std \
     vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \
     vector(const vector& c) : vector_base(c) {}  \
     explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
-    vector(iterator start, iterator end) : vector_base(start, end) {}  \
+    vector(iterator start_, iterator end_) : vector_base(start_, end_) {}  \
     vector& operator=(const vector& x) {  \
       vector_base::operator=(x);  \
       return *this;  \
@@ -62,7 +62,7 @@ namespace std {
     : vector_base(first, last, a) {} \
     vector(const vector& c) : vector_base(c) {}  \
     explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
-    vector(iterator start, iterator end) : vector_base(start, end) {}  \
+    vector(iterator start_, iterator end_) : vector_base(start_, end_) {}  \
     vector& operator=(const vector& x) {  \
       vector_base::operator=(x);  \
       return *this;  \
diff --git a/contrib/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h b/contrib/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 7261c7d0fc8ca8b026980b697cdd0d93d6865701..d1d3ad7f192915d146fe1eaba403e02c716d22e9 100644
--- a/contrib/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/contrib/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -217,12 +217,12 @@ struct SluMatrix : SuperMatrix
     res.setScalarType<typename MatrixType::Scalar>();
 
     // FIXME the following is not very accurate
-    if (MatrixType::Flags & Upper)
+    if (int(MatrixType::Flags) & int(Upper))
       res.Mtype = SLU_TRU;
-    if (MatrixType::Flags & Lower)
+    if (int(MatrixType::Flags) & int(Lower))
       res.Mtype = SLU_TRL;
 
-    eigen_assert(((MatrixType::Flags & SelfAdjoint)==0) && "SelfAdjoint matrix shape not supported by SuperLU");
+    eigen_assert(((int(MatrixType::Flags) & int(SelfAdjoint))==0) && "SelfAdjoint matrix shape not supported by SuperLU");
 
     return res;
   }
@@ -352,7 +352,7 @@ class SuperLUBase : public SparseSolverBase<Derived>
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -650,9 +650,8 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
-  const Index size = m_matrix.rows();
   const Index rhsCols = b.cols();
-  eigen_assert(size==b.rows());
+  eigen_assert(m_matrix.rows()==b.rows());
 
   m_sluOptions.Trans = NOTRANS;
   m_sluOptions.Fact = FACTORED;
@@ -974,9 +973,8 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
-  const int size = m_matrix.rows();
   const int rhsCols = b.cols();
-  eigen_assert(size==b.rows());
+  eigen_assert(m_matrix.rows()==b.rows());
 
   m_sluOptions.Trans = NOTRANS;
   m_sluOptions.Fact = FACTORED;
diff --git a/contrib/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h b/contrib/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 91c09ab13382413f77e2d65f7bc19b760cdd1692..e3a333f80f0511e861591c7a5e92da95ffa2142c 100644
--- a/contrib/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/contrib/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -10,6 +10,16 @@
 #ifndef EIGEN_UMFPACKSUPPORT_H
 #define EIGEN_UMFPACKSUPPORT_H
 
+// for compatibility with super old version of umfpack,
+// not sure this is really needed, but this is harmless.
+#ifndef SuiteSparse_long
+#ifdef UF_long
+#define SuiteSparse_long UF_long
+#else
+#error neither SuiteSparse_long nor UF_long are defined
+#endif
+#endif
+
 namespace Eigen {
 
 /* TODO extract L, extract U, compute det, etc... */
@@ -17,42 +27,85 @@ namespace Eigen {
 // generic double/complex<double> wrapper functions:
 
 
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], double)
+ // Defaults
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int)
 { umfpack_di_defaults(control); }
 
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>)
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, int)
 { umfpack_zi_defaults(control); }
 
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double)
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
+{ umfpack_dl_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_defaults(control); }
+
+// Report info
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int)
 { umfpack_di_report_info(control, info);}
 
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>)
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, int)
 { umfpack_zi_report_info(control, info);}
 
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double)
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long)
+{ umfpack_dl_report_info(control, info);}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_info(control, info);}
+
+// Report status
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int)
 { umfpack_di_report_status(control, status);}
 
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>)
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, int)
 { umfpack_zi_report_status(control, status);}
 
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], double)
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long)
+{ umfpack_dl_report_status(control, status);}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_status(control, status);}
+
+// report control
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int)
 { umfpack_di_report_control(control);}
 
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>)
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, int)
 { umfpack_zi_report_control(control);}
 
-inline void umfpack_free_numeric(void **Numeric, double)
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
+{ umfpack_dl_report_control(control);}
+
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_control(control);}
+
+// Free numeric
+inline void umfpack_free_numeric(void **Numeric, double, int)
 { umfpack_di_free_numeric(Numeric); *Numeric = 0; }
 
-inline void umfpack_free_numeric(void **Numeric, std::complex<double>)
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, int)
 { umfpack_zi_free_numeric(Numeric); *Numeric = 0; }
 
-inline void umfpack_free_symbolic(void **Symbolic, double)
+inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long)
+{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; }
+
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; }
+
+// Free symbolic
+inline void umfpack_free_symbolic(void **Symbolic, double, int)
 { umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; }
 
-inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>)
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, int)
 { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; }
 
+inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long)
+{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; }
+
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; }
+
+// Symbolic
 inline int umfpack_symbolic(int n_row,int n_col,
                             const int Ap[], const int Ai[], const double Ax[], void **Symbolic,
                             const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
@@ -66,7 +119,21 @@ inline int umfpack_symbolic(int n_row,int n_col,
 {
   return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
 }
+inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
+                                          const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic,
+                                          const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+  return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info);
+}
 
+inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
+                                          const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[], void **Symbolic,
+                                          const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+  return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
+}
+
+// Numeric
 inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[],
                             void *Symbolic, void **Numeric,
                             const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
@@ -80,7 +147,21 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex<d
 {
   return umfpack_zi_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
 }
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+                                        void *Symbolic, void **Numeric,
+                                        const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+  return umfpack_dl_numeric(Ap,Ai,Ax,Symbolic,Numeric,Control,Info);
+}
 
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
+                                        void *Symbolic, void **Numeric,
+                                        const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+  return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
+}
+
+// solve
 inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[],
                           double X[], const double B[], void *Numeric,
                           const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
@@ -95,6 +176,21 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co
   return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
 }
 
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+                                      double X[], const double B[], void *Numeric,
+                                      const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+  return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info);
+}
+
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
+                                      std::complex<double> X[], const std::complex<double> B[], void *Numeric,
+                                      const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+  return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
+}
+
+// Get Lunz
 inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double)
 {
   return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
@@ -105,6 +201,19 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_
   return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
 }
 
+inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
+                                          SuiteSparse_long *nz_udiag, void *Numeric, double)
+{
+  return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
+                                          SuiteSparse_long *nz_udiag, void *Numeric, std::complex<double>)
+{
+  return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+// Get Numeric
 inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[],
                                int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric)
 {
@@ -120,18 +229,45 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex<double> Lx[], in
   return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
                                 Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
 }
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[],
+                                            SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
+{
+  return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric);
+}
 
-inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex<double> Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex<double> Ux[],
+                                            SuiteSparse_long P[], SuiteSparse_long Q[], std::complex<double> Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
+{
+  double& lx0_real = numext::real_ref(Lx[0]);
+  double& ux0_real = numext::real_ref(Ux[0]);
+  double& dx0_real = numext::real_ref(Dx[0]);
+  return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
+                                Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
+}
+
+// Get Determinant
+inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
 {
   return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info);
 }
 
-inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
 {
   double& mx_real = numext::real_ref(*Mx);
   return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
 }
 
+inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
+{
+  return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info);
+}
+
+inline SuiteSparse_long umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
+{
+  double& mx_real = numext::real_ref(*Mx);
+  return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
+}
+
 
 /** \ingroup UmfPackSupport_Module
   * \brief A sparse LU factorization and solver based on UmfPack
@@ -164,7 +300,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
     typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
     typedef SparseMatrix<Scalar> LUMatrixType;
-    typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> UmfpackMatrixType;
     typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
     enum {
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -192,8 +328,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 
     ~UmfPackLU()
     {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex());
     }
 
     inline Index rows() const { return mp_matrix.rows(); }
@@ -201,7 +337,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -241,8 +377,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     template<typename InputMatrixType>
     void compute(const InputMatrixType& matrix)
     {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
       grab(matrix.derived());
       analyzePattern_impl();
       factorize_impl();
@@ -257,8 +393,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     template<typename InputMatrixType>
     void analyzePattern(const InputMatrixType& matrix)
     {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
 
       grab(matrix.derived());
 
@@ -309,7 +445,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     {
       eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
       if(m_numeric)
-        umfpack_free_numeric(&m_numeric,Scalar());
+        umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
 
       grab(matrix.derived());
 
@@ -320,28 +456,28 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
       *
       * \sa umfpackControl()
       */
-    void umfpackReportControl()
+    void printUmfpackControl()
     {
-      umfpack_report_control(m_control.data(), Scalar());
+      umfpack_report_control(m_control.data(), Scalar(),StorageIndex());
     }
 
     /** Prints statistics collected by UmfPack.
       *
       * \sa analyzePattern(), compute()
       */
-    void umfpackReportInfo()
+    void printUmfpackInfo()
     {
       eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
-      umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar());
+      umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex());
     }
 
     /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization).
       *
       * \sa analyzePattern(), compute()
       */
-    void umfpackReportStatus() {
+    void printUmfpackStatus() {
       eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
-      umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar());
+      umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex());
     }
 
     /** \internal */
@@ -362,13 +498,13 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
       m_symbolic              = 0;
       m_extractedDataAreDirty = true;
 
-      umfpack_defaults(m_control.data(), Scalar());
+      umfpack_defaults(m_control.data(), Scalar(),StorageIndex());
     }
 
     void analyzePattern_impl()
     {
-      m_fact_errorCode = umfpack_symbolic(internal::convert_index<int>(mp_matrix.rows()),
-                                          internal::convert_index<int>(mp_matrix.cols()),
+      m_fact_errorCode = umfpack_symbolic(internal::convert_index<StorageIndex>(mp_matrix.rows()),
+                                          internal::convert_index<StorageIndex>(mp_matrix.cols()),
                                           mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
                                           &m_symbolic, m_control.data(), m_umfpackInfo.data());
 
@@ -408,7 +544,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 
     // cached data to reduce reallocation, etc.
     mutable LUMatrixType m_l;
-    int m_fact_errorCode;
+    StorageIndex m_fact_errorCode;
     UmfpackControl m_control;
     mutable UmfpackInfo m_umfpackInfo;
 
@@ -438,7 +574,7 @@ void UmfPackLU<MatrixType>::extractData() const
   if (m_extractedDataAreDirty)
   {
     // get size of the data
-    int lnz, unz, rows, cols, nz_udiag;
+    StorageIndex lnz, unz, rows, cols, nz_udiag;
     umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
 
     // allocate data
@@ -464,7 +600,7 @@ template<typename MatrixType>
 typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() const
 {
   Scalar det;
-  umfpack_get_determinant(&det, 0, m_numeric, 0);
+  umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex());
   return det;
 }
 
@@ -477,7 +613,6 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
   eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
   eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
 
-  int errorCode;
   Scalar* x_ptr = 0;
   Matrix<Scalar,Dynamic,1> x_tmp;
   if(x.innerStride()!=1)
@@ -489,9 +624,10 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
   {
     if(x.innerStride()==1)
       x_ptr = &x.col(j).coeffRef(0);
-    errorCode = umfpack_solve(UMFPACK_A,
-        mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
-        x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data());
+    StorageIndex errorCode = umfpack_solve(UMFPACK_A,
+                                mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                x_ptr, &b.const_cast_derived().col(j).coeffRef(0),
+                                m_numeric, m_control.data(), m_umfpackInfo.data());
     if(x.innerStride()!=1)
       x.col(j) = x_tmp;
     if (errorCode!=0)
diff --git a/contrib/eigen/Eigen/src/misc/lapacke.h b/contrib/eigen/Eigen/src/misc/lapacke.h
index 8c7e79b034c71de56f8c98b7a03c23850b4402dc..3d8e24f5a2ea640658a65f70f21f656c40185fea 100755
--- a/contrib/eigen/Eigen/src/misc/lapacke.h
+++ b/contrib/eigen/Eigen/src/misc/lapacke.h
@@ -43,10 +43,6 @@
 #include "lapacke_config.h"
 #endif
 
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
 #include <stdlib.h>
 
 #ifndef lapack_int
@@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im );
 
 #endif
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
 #ifndef LAPACKE_malloc
 #define LAPACKE_malloc( size ) malloc( size )
 #endif
diff --git a/contrib/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/contrib/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 1f8a531af5966f86d10ffc7568dd4d1473ec1f99..0e5d5445b18fcda01a43e33e958a9a964b8d1354 100644
--- a/contrib/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/contrib/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -75,6 +75,32 @@ max
   return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
 }
 
+/** \returns an expression of the coefficient-wise absdiff of \c *this and \a other
+  *
+  * Example: \include Cwise_absolute_difference.cpp
+  * Output: \verbinclude Cwise_absolute_difference.out
+  *
+  * \sa absolute_difference()
+  */
+EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference,absolute_difference)
+
+/** \returns an expression of the coefficient-wise absolute_difference of \c *this and scalar \a other
+  *
+  * \sa absolute_difference()
+  */
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_absolute_difference_op<Scalar,Scalar>, const Derived,
+                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+absolute_difference
+#else
+(absolute_difference)
+#endif
+(const Scalar &other) const
+{
+  return (absolute_difference)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
 /** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents.
   *
   * This function computes the coefficient-wise power.
@@ -119,7 +145,7 @@ OP(const Scalar& s) const { \
   return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \
 } \
 EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \
-OP(const Scalar& s, const Derived& d) { \
+OP(const Scalar& s, const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& d) { \
   return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \
 }
 
@@ -314,9 +340,9 @@ polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedN> &n) const
   *
   * It returns the Riemann zeta function of two arguments \c *this and \a q:
   *
-  * \param *this is the exposent, it must be > 1
   * \param q is the shift, it must be > 0
   *
+  * \note *this is the exponent, it must be > 1.
   * \note This function supports only float and double scalar types. To support other scalar types, the user has
   * to provide implementations of zeta(T,T) for any scalar type T to be supported.
   *
diff --git a/contrib/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/contrib/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index ebaa3f192b11719e74ab628d510760954ee506ef..13c55f4b115880c0becb6246bdc1fb86c9f404fa 100644
--- a/contrib/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/contrib/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -10,9 +10,11 @@ typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> Inverse
 typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> BooleanNotReturnType;
 
 typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
+typedef CwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived> Expm1ReturnType;
 typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
 typedef CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> Log1pReturnType;
 typedef CwiseUnaryOp<internal::scalar_log10_op<Scalar>, const Derived> Log10ReturnType;
+typedef CwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived> Log2ReturnType;
 typedef CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived> CosReturnType;
 typedef CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived> SinReturnType;
 typedef CwiseUnaryOp<internal::scalar_tan_op<Scalar>, const Derived> TanReturnType;
@@ -20,11 +22,18 @@ typedef CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived> AcosReturn
 typedef CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived> AsinReturnType;
 typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
 typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
 typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
+#if EIGEN_HAS_CXX11_MATH
+typedef CwiseUnaryOp<internal::scalar_atanh_op<Scalar>, const Derived> AtanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_asinh_op<Scalar>, const Derived> AsinhReturnType;
+typedef CwiseUnaryOp<internal::scalar_acosh_op<Scalar>, const Derived> AcoshReturnType;
+#endif
 typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
 typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
 typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
 typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundReturnType;
+typedef CwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived> RintReturnType;
 typedef CwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived> FloorReturnType;
 typedef CwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived> CeilReturnType;
 typedef CwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived> IsNaNReturnType;
@@ -90,6 +99,20 @@ exp() const
   return ExpReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise exponential of *this minus 1.
+  *
+  * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1,
+  * however, with finite precision, this function is much more accurate when \c x is close to zero.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_expm1">Math functions</a>, exp()
+  */
+EIGEN_DEVICE_FUNC
+inline const Expm1ReturnType
+expm1() const
+{
+  return Expm1ReturnType(derived());
+}
+
 /** \returns an expression of the coefficient-wise logarithm of *this.
   *
   * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the
@@ -98,7 +121,7 @@ exp() const
   * Example: \include Cwise_log.cpp
   * Output: \verbinclude Cwise_log.out
   *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, exp()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, log()
   */
 EIGEN_DEVICE_FUNC
 inline const LogReturnType
@@ -137,6 +160,18 @@ log10() const
   return Log10ReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise base-2 logarithm of *this.
+  *
+  * This function computes the coefficient-wise base-2 logarithm.
+  *
+  */
+EIGEN_DEVICE_FUNC
+inline const Log2ReturnType
+log2() const
+{
+  return Log2ReturnType(derived());
+}
+
 /** \returns an expression of the coefficient-wise square root of *this.
   *
   * This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the
@@ -311,7 +346,7 @@ sinh() const
   * Example: \include Cwise_cosh.cpp
   * Output: \verbinclude Cwise_cosh.out
   *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tan(), sinh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh()
   */
 EIGEN_DEVICE_FUNC
 inline const CoshReturnType
@@ -320,6 +355,50 @@ cosh() const
   return CoshReturnType(derived());
 }
 
+#if EIGEN_HAS_CXX11_MATH
+/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atanh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AtanhReturnType
+atanh() const
+{
+  return AtanhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asinh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AsinhReturnType
+asinh() const
+{
+  return AsinhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acosh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AcoshReturnType
+acosh() const
+{
+  return AcoshReturnType(derived());
+}
+#endif
+
+/** \returns an expression of the coefficient-wise logistic of *this.
+  */
+EIGEN_DEVICE_FUNC
+inline const LogisticReturnType
+logistic() const
+{
+  return LogisticReturnType(derived());
+}
+
 /** \returns an expression of the coefficient-wise inverse of *this.
   *
   * Example: \include Cwise_inverse.cpp
@@ -362,6 +441,20 @@ cube() const
   return CubeReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise rint of *this.
+  *
+  * Example: \include Cwise_rint.cpp
+  * Output: \verbinclude Cwise_rint.out
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_rint">Math functions</a>, ceil(), floor()
+  */
+EIGEN_DEVICE_FUNC
+inline const RintReturnType
+rint() const
+{
+  return RintReturnType(derived());
+}
+
 /** \returns an expression of the coefficient-wise round of *this.
   *
   * Example: \include Cwise_round.cpp
@@ -404,6 +497,45 @@ ceil() const
   return CeilReturnType(derived());
 }
 
+template<int N> struct ShiftRightXpr {
+  typedef CwiseUnaryOp<internal::scalar_shift_right_op<Scalar, N>, const Derived> Type;
+};
+
+/** \returns an expression of \c *this with the \a Scalar type arithmetically
+  * shifted right by \a N bit positions.
+  *
+  * The template parameter \a N specifies the number of bit positions to shift.
+  * 
+  * \sa shiftLeft()
+  */
+template<int N>
+EIGEN_DEVICE_FUNC
+typename ShiftRightXpr<N>::Type
+shiftRight() const
+{
+  return typename ShiftRightXpr<N>::Type(derived());
+}
+
+
+template<int N> struct ShiftLeftXpr {
+  typedef CwiseUnaryOp<internal::scalar_shift_left_op<Scalar, N>, const Derived> Type;
+};
+
+/** \returns an expression of \c *this with the \a Scalar type logically
+  * shifted left by \a N bit positions.
+  *
+  * The template parameter \a N specifies the number of bit positions to shift.
+  *
+  * \sa shiftRight()
+  */
+template<int N>
+EIGEN_DEVICE_FUNC
+typename ShiftLeftXpr<N>::Type
+shiftLeft() const
+{
+  return typename ShiftLeftXpr<N>::Type(derived());
+}
+
 /** \returns an expression of the coefficient-wise isnan of *this.
   *
   * Example: \include Cwise_isNaN.cpp
@@ -471,14 +603,12 @@ typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaRe
 typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
 typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
 typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
+typedef CwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived> NdtriReturnType;
 
 /** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
   *
   * \specialfunctions_module
   *
-  * Example: \include Cwise_lgamma.cpp
-  * Output: \verbinclude Cwise_lgamma.out
-  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
   * type T to be supported.
@@ -514,9 +644,6 @@ digamma() const
   *
   * \specialfunctions_module
   *
-  * Example: \include Cwise_erf.cpp
-  * Output: \verbinclude Cwise_erf.out
-  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
   * type T to be supported.
@@ -535,9 +662,6 @@ erf() const
   *
   * \specialfunctions_module
   *
-  * Example: \include Cwise_erfc.cpp
-  * Output: \verbinclude Cwise_erfc.out
-  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
   * type T to be supported.
@@ -550,3 +674,23 @@ erfc() const
 {
   return ErfcReturnType(derived());
 }
+
+/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function
+  * function of *this.
+  *
+  * \specialfunctions_module
+  * 
+  * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the
+  * Gaussian probability density function (integrated from minus infinity to x) is equal to y.
+  *
+  * \note This function supports only float and double scalar types. To support other scalar types,
+  * the user has to provide implementations of ndtri(T) for any scalar type T to be supported.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>
+  */
+EIGEN_DEVICE_FUNC
+inline const NdtriReturnType
+ndtri() const
+{
+  return NdtriReturnType(derived());
+}
diff --git a/contrib/eigen/Eigen/src/plugins/BlockMethods.h b/contrib/eigen/Eigen/src/plugins/BlockMethods.h
index ac35a0086cf8befddaa5f38ff3ca7588e5750712..63a52a6ffaa9bcadef7961d05fabe353f52aadc2 100644
--- a/contrib/eigen/Eigen/src/plugins/BlockMethods.h
+++ b/contrib/eigen/Eigen/src/plugins/BlockMethods.h
@@ -40,68 +40,126 @@ typedef const VectorBlock<const Derived> ConstSegmentReturnType;
 template<int Size> struct FixedSegmentReturnType { typedef VectorBlock<Derived, Size> Type; };
 template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBlock<const Derived, Size> Type; };
 
+/// \internal inner-vector
+typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
+typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
+
+/// \internal set of inner-vectors
+typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
+typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
+
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/// \returns a dynamic-size expression of a block in *this.
+/// \returns an expression of a block in \c *this with either dynamic or fixed sizes.
 ///
-/// \param startRow the first row in the block
-/// \param startCol the first column in the block
-/// \param blockRows the number of rows in the block
-/// \param blockCols the number of columns in the block
+/// \param  startRow  the first row in the block
+/// \param  startCol  the first column in the block
+/// \param  blockRows number of rows in the block, specified at either run-time or compile-time
+/// \param  blockCols number of columns in the block, specified at either run-time or compile-time
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
-/// Example: \include MatrixBase_block_int_int_int_int.cpp
+/// Example using runtime (aka dynamic) sizes: \include MatrixBase_block_int_int_int_int.cpp
 /// Output: \verbinclude MatrixBase_block_int_int_int_int.out
 ///
-/// \note Even though the returned expression has dynamic size, in the case
+/// \newin{3.4}:
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic.
+/// Here is an example with a fixed number of rows \c NRows and dynamic number of columns \c cols:
+/// \code
+/// mat.block(i,j,fix<NRows>,cols)
+/// \endcode
+///
+/// This function thus fully covers the features offered by the following overloads block<NRows,NCols>(Index, Index),
+/// and block<NRows,NCols>(Index, Index, Index, Index) that are thus obsolete. Indeed, this generic version avoids
+/// redundancy, it preserves the argument order, and prevents the need to rely on the template keyword in templated code.
+///
+/// but with less redundancy and more consistency as it does not modify the argument order
+/// and seamlessly enable hybrid fixed/dynamic sizes.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
 /// when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
 /// which means that evaluating it does not cause a dynamic memory allocation.
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index)
+/// \sa class Block, fix, fix<N>(int)
 ///
-EIGEN_DEVICE_FUNC
-inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols)
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols)
 {
-  return BlockXpr(derived(), startRow, startCol, blockRows, blockCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type(
+            derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols));
 }
 
-/// This is the const version of block(Index,Index,Index,Index). */
-EIGEN_DEVICE_FUNC
-inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
+/// This is the const version of block(Index,Index,NRowsType,NColsType)
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstFixedBlockXpr<...,...>::Type
+#endif
+block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const
 {
-  return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type(
+            derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols));
 }
 
 
 
-
-/// \returns a dynamic-size expression of a top-right corner of *this.
+/// \returns a expression of a top-right corner of \c *this with either dynamic or fixed sizes.
 ///
 /// \param cRows the number of rows in the corner
 /// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
-/// Example: \include MatrixBase_topRightCorner_int_int.cpp
+/// Example with dynamic sizes: \include MatrixBase_topRightCorner_int_int.cpp
 /// Output: \verbinclude MatrixBase_topRightCorner_int_int.out
 ///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline BlockXpr topRightCorner(Index cRows, Index cCols)
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+topRightCorner(NRowsType cRows, NColsType cCols)
 {
-  return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/// This is the const version of topRightCorner(Index, Index).
-EIGEN_DEVICE_FUNC
-inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const
+/// This is the const version of topRightCorner(NRowsType, NColsType).
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstFixedBlockXpr<...,...>::Type
+#endif
+topRightCorner(NRowsType cRows, NColsType cCols) const
 {
-  return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/// \returns an expression of a fixed-size top-right corner of *this.
+/// \returns an expression of a fixed-size top-right corner of \c *this.
 ///
 /// \tparam CRows the number of rows in the corner
 /// \tparam CCols the number of columns in the corner
@@ -114,21 +172,21 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block, block<int,int>(Index,Index)
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
 /// This is the const version of topRightCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/// \returns an expression of a top-right corner of *this.
+/// \returns an expression of a top-right corner of \c *this.
 ///
 /// \tparam CRows number of rows in corner as specified at compile-time
 /// \tparam CCols number of columns in corner as specified at compile-time
@@ -148,46 +206,67 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template<int CRows, int CCols>
-inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
 /// This is the const version of topRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols) const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
 
 
-/// \returns a dynamic-size expression of a top-left corner of *this.
+/// \returns an expression of a top-left corner of \c *this  with either dynamic or fixed sizes.
 ///
 /// \param cRows the number of rows in the corner
 /// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_topLeftCorner_int_int.cpp
 /// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
 ///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline BlockXpr topLeftCorner(Index cRows, Index cCols)
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+topLeftCorner(NRowsType cRows, NColsType cCols)
 {
-  return BlockXpr(derived(), 0, 0, cRows, cCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
 /// This is the const version of topLeftCorner(Index, Index).
-EIGEN_DEVICE_FUNC
-inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstFixedBlockXpr<...,...>::Type
+#endif
+topLeftCorner(NRowsType cRows, NColsType cCols) const
 {
-  return ConstBlockXpr(derived(), 0, 0, cRows, cCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/// \returns an expression of a fixed-size top-left corner of *this.
+/// \returns an expression of a fixed-size top-left corner of \c *this.
 ///
 /// The template parameters CRows and CCols are the number of rows and columns in the corner.
 ///
@@ -196,24 +275,24 @@ inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
 /// This is the const version of topLeftCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/// \returns an expression of a top-left corner of *this.
+/// \returns an expression of a top-left corner of \c *this.
 ///
 /// \tparam CRows number of rows in corner as specified at compile-time
 /// \tparam CCols number of columns in corner as specified at compile-time
@@ -233,46 +312,69 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template<int CRows, int CCols>
-inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
 /// This is the const version of topLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols) const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
 
 
-/// \returns a dynamic-size expression of a bottom-right corner of *this.
+/// \returns an expression of a bottom-right corner of \c *this  with either dynamic or fixed sizes.
 ///
 /// \param cRows the number of rows in the corner
 /// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_bottomRightCorner_int_int.cpp
 /// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
 ///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline BlockXpr bottomRightCorner(Index cRows, Index cCols)
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+bottomRightCorner(NRowsType cRows, NColsType cCols)
 {
-  return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols),
+                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/// This is the const version of bottomRightCorner(Index, Index).
-EIGEN_DEVICE_FUNC
-inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const
+/// This is the const version of bottomRightCorner(NRowsType, NColsType).
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstFixedBlockXpr<...,...>::Type
+#endif
+bottomRightCorner(NRowsType cRows, NColsType cCols) const
 {
-  return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols),
+                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/// \returns an expression of a fixed-size bottom-right corner of *this.
+/// \returns an expression of a fixed-size bottom-right corner of \c *this.
 ///
 /// The template parameters CRows and CCols are the number of rows and columns in the corner.
 ///
@@ -281,24 +383,24 @@ inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
 /// This is the const version of bottomRightCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner() const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/// \returns an expression of a bottom-right corner of *this.
+/// \returns an expression of a bottom-right corner of \c *this.
 ///
 /// \tparam CRows number of rows in corner as specified at compile-time
 /// \tparam CCols number of columns in corner as specified at compile-time
@@ -318,46 +420,69 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template<int CRows, int CCols>
-inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
 /// This is the const version of bottomRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols) const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
 
 
-/// \returns a dynamic-size expression of a bottom-left corner of *this.
+/// \returns an expression of a bottom-left corner of \c *this  with either dynamic or fixed sizes.
 ///
 /// \param cRows the number of rows in the corner
 /// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
 /// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
 ///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline BlockXpr bottomLeftCorner(Index cRows, Index cCols)
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename FixedBlockXpr<...,...>::Type
+#endif
+bottomLeftCorner(NRowsType cRows, NColsType cCols)
 {
-  return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(cRows), 0,
+                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/// This is the const version of bottomLeftCorner(Index, Index).
-EIGEN_DEVICE_FUNC
-inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const
+/// This is the const version of bottomLeftCorner(NRowsType, NColsType).
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename ConstFixedBlockXpr<...,...>::Type
+#endif
+bottomLeftCorner(NRowsType cRows, NColsType cCols) const
 {
-  return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(cRows), 0,
+                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
 }
 
-/// \returns an expression of a fixed-size bottom-left corner of *this.
+/// \returns an expression of a fixed-size bottom-left corner of \c *this.
 ///
 /// The template parameters CRows and CCols are the number of rows and columns in the corner.
 ///
@@ -366,24 +491,24 @@ inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
 /// This is the const version of bottomLeftCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC
-inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/// \returns an expression of a bottom-left corner of *this.
+/// \returns an expression of a bottom-left corner of \c *this.
 ///
 /// \tparam CRows number of rows in corner as specified at compile-time
 /// \tparam CCols number of columns in corner as specified at compile-time
@@ -403,45 +528,66 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template<int CRows, int CCols>
-inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols)
+EIGEN_STRONG_INLINE
+typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
 /// This is the const version of bottomLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols) const
+EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols) const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
 
 
-/// \returns a block consisting of the top rows of *this.
+/// \returns a block consisting of the top rows of \c *this.
 ///
 /// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
 ///
 /// Example: \include MatrixBase_topRows_int.cpp
 /// Output: \verbinclude MatrixBase_topRows_int.out
 ///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline RowsBlockXpr topRows(Index n)
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+typename NRowsBlockXpr<...>::Type
+#endif
+topRows(NRowsType n)
 {
-  return RowsBlockXpr(derived(), 0, 0, n, cols());
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), 0, 0, internal::get_runtime_value(n), cols());
 }
 
-/// This is the const version of topRows(Index).
-EIGEN_DEVICE_FUNC
-inline ConstRowsBlockXpr topRows(Index n) const
+/// This is the const version of topRows(NRowsType).
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+const typename ConstNRowsBlockXpr<...>::Type
+#endif
+topRows(NRowsType n) const
 {
-  return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), 0, 0, internal::get_runtime_value(n), cols());
 }
 
-/// \returns a block consisting of the top rows of *this.
+/// \returns a block consisting of the top rows of \c *this.
 ///
 /// \tparam N the number of rows in the block as specified at compile-time
 /// \param n the number of rows in the block as specified at run-time
@@ -454,50 +600,69 @@ inline ConstRowsBlockXpr topRows(Index n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NRowsBlockXpr<N>::Type topRows(Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
 /// This is the const version of topRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
 
 
-/// \returns a block consisting of the bottom rows of *this.
+/// \returns a block consisting of the bottom rows of \c *this.
 ///
 /// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
 ///
 /// Example: \include MatrixBase_bottomRows_int.cpp
 /// Output: \verbinclude MatrixBase_bottomRows_int.out
 ///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline RowsBlockXpr bottomRows(Index n)
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+typename NRowsBlockXpr<...>::Type
+#endif
+bottomRows(NRowsType n)
 {
-  return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols());
 }
 
-/// This is the const version of bottomRows(Index).
-EIGEN_DEVICE_FUNC
-inline ConstRowsBlockXpr bottomRows(Index n) const
+/// This is the const version of bottomRows(NRowsType).
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+const typename ConstNRowsBlockXpr<...>::Type
+#endif
+bottomRows(NRowsType n) const
 {
-  return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols());
 }
 
-/// \returns a block consisting of the bottom rows of *this.
+/// \returns a block consisting of the bottom rows of \c *this.
 ///
 /// \tparam N the number of rows in the block as specified at compile-time
 /// \param n the number of rows in the block as specified at run-time
@@ -510,51 +675,70 @@ inline ConstRowsBlockXpr bottomRows(Index n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
 /// This is the const version of bottomRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
 
 
-/// \returns a block consisting of a range of rows of *this.
+/// \returns a block consisting of a range of rows of \c *this.
 ///
 /// \param startRow the index of the first row in the block
 /// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
 ///
 /// Example: \include DenseBase_middleRows_int.cpp
 /// Output: \verbinclude DenseBase_middleRows_int.out
 ///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline RowsBlockXpr middleRows(Index startRow, Index n)
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+typename NRowsBlockXpr<...>::Type
+#endif
+middleRows(Index startRow, NRowsType n)
 {
-  return RowsBlockXpr(derived(), startRow, 0, n, cols());
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), startRow, 0, internal::get_runtime_value(n), cols());
 }
 
-/// This is the const version of middleRows(Index,Index).
-EIGEN_DEVICE_FUNC
-inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
+/// This is the const version of middleRows(Index,NRowsType).
+template<typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+const typename ConstNRowsBlockXpr<...>::Type
+#endif
+middleRows(Index startRow, NRowsType n) const
 {
-  return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+            (derived(), startRow, 0, internal::get_runtime_value(n), cols());
 }
 
-/// \returns a block consisting of a range of rows of *this.
+/// \returns a block consisting of a range of rows of \c *this.
 ///
 /// \tparam N the number of rows in the block as specified at compile-time
 /// \param startRow the index of the first row in the block
@@ -568,50 +752,69 @@ inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
 /// This is the const version of middleRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
 
 
-/// \returns a block consisting of the left columns of *this.
+/// \returns a block consisting of the left columns of \c *this.
 ///
 /// \param n the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_leftCols_int.cpp
 /// Output: \verbinclude MatrixBase_leftCols_int.out
 ///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline ColsBlockXpr leftCols(Index n)
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename NColsBlockXpr<...>::Type
+#endif
+leftCols(NColsType n)
 {
-  return ColsBlockXpr(derived(), 0, 0, rows(), n);
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, 0, rows(), internal::get_runtime_value(n));
 }
 
-/// This is the const version of leftCols(Index).
-EIGEN_DEVICE_FUNC
-inline ConstColsBlockXpr leftCols(Index n) const
+/// This is the const version of leftCols(NColsType).
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstNColsBlockXpr<...>::Type
+#endif
+leftCols(NColsType n) const
 {
-  return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, 0, rows(), internal::get_runtime_value(n));
 }
 
-/// \returns a block consisting of the left columns of *this.
+/// \returns a block consisting of the left columns of \c *this.
 ///
 /// \tparam N the number of columns in the block as specified at compile-time
 /// \param n the number of columns in the block as specified at run-time
@@ -624,50 +827,69 @@ inline ConstColsBlockXpr leftCols(Index n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NColsBlockXpr<N>::Type leftCols(Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
 /// This is the const version of leftCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
 
 
-/// \returns a block consisting of the right columns of *this.
+/// \returns a block consisting of the right columns of \c *this.
 ///
 /// \param n the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_rightCols_int.cpp
 /// Output: \verbinclude MatrixBase_rightCols_int.out
 ///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline ColsBlockXpr rightCols(Index n)
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename NColsBlockXpr<...>::Type
+#endif
+rightCols(NColsType n)
 {
-  return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n));
 }
 
-/// This is the const version of rightCols(Index).
-EIGEN_DEVICE_FUNC
-inline ConstColsBlockXpr rightCols(Index n) const
+/// This is the const version of rightCols(NColsType).
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstNColsBlockXpr<...>::Type
+#endif
+rightCols(NColsType n) const
 {
-  return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n));
 }
 
-/// \returns a block consisting of the right columns of *this.
+/// \returns a block consisting of the right columns of \c *this.
 ///
 /// \tparam N the number of columns in the block as specified at compile-time
 /// \param n the number of columns in the block as specified at run-time
@@ -680,51 +902,70 @@ inline ConstColsBlockXpr rightCols(Index n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NColsBlockXpr<N>::Type rightCols(Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
 /// This is the const version of rightCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
 
 
-/// \returns a block consisting of a range of columns of *this.
+/// \returns a block consisting of a range of columns of \c *this.
 ///
 /// \param startCol the index of the first column in the block
 /// \param numCols the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include DenseBase_middleCols_int.cpp
 /// Output: \verbinclude DenseBase_middleCols_int.out
 ///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline ColsBlockXpr middleCols(Index startCol, Index numCols)
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+typename NColsBlockXpr<...>::Type
+#endif
+middleCols(Index startCol, NColsType numCols)
 {
-  return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols));
 }
 
-/// This is the const version of middleCols(Index,Index).
-EIGEN_DEVICE_FUNC
-inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
+/// This is the const version of middleCols(Index,NColsType).
+template<typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+const typename ConstNColsBlockXpr<...>::Type
+#endif
+middleCols(Index startCol, NColsType numCols) const
 {
-  return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+            (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols));
 }
 
-/// \returns a block consisting of a range of columns of *this.
+/// \returns a block consisting of a range of columns of \c *this.
 ///
 /// \tparam N the number of columns in the block as specified at compile-time
 /// \param startCol the index of the first column in the block
@@ -738,26 +979,26 @@ inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
 /// This is the const version of middleCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
 
 
-/// \returns a fixed-size expression of a block in *this.
+/// \returns a fixed-size expression of a block of \c *this.
 ///
 /// The template parameters \a NRows and \a NCols are the number of
 /// rows and columns in the block.
@@ -768,29 +1009,35 @@ inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n =
 /// Example: \include MatrixBase_block_int_int.cpp
 /// Output: \verbinclude MatrixBase_block_int_int.out
 ///
+/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
+/// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence:
+/// \code
+/// mat.template block<NRows,NCols>(i,j)  <-->  mat.block(i,j,fix<NRows>,fix<NCols>)
+/// \endcode
+///
 /// \note since block is a templated member, the keyword template has to be used
 /// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int NRows, int NCols>
-EIGEN_DEVICE_FUNC
-inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol)
 {
   return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
 /// This is the const version of block<>(Index, Index). */
 template<int NRows, int NCols>
-EIGEN_DEVICE_FUNC
-inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol) const
 {
   return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
-/// \returns an expression of a block in *this.
+/// \returns an expression of a block of \c *this.
 ///
 /// \tparam NRows number of rows in block as specified at compile-time
 /// \tparam NCols number of columns in block as specified at compile-time
@@ -805,14 +1052,25 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
 /// \a NRows is \a Dynamic, and the same for the number of columns.
 ///
 /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
+///
+/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
+/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence:
+/// \code
+/// mat.template block<NRows,NCols>(i,j,rows,cols)     <-->  mat.block(i,j,fix<NRows>(rows),fix<NCols>(cols))
+/// \endcode
+/// If we known that, e.g., NRows==Dynamic and NCols!=Dynamic, then the equivalence becomes:
+/// \code
+/// mat.template block<Dynamic,NCols>(i,j,rows,NCols)  <-->  mat.block(i,j,rows,fix<NCols>)
+/// \endcode
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, block(Index,Index,Index,Index)
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
 ///
 template<int NRows, int NCols>
-inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                   Index blockRows, Index blockCols)
 {
   return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
@@ -820,13 +1078,14 @@ inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index sta
 
 /// This is the const version of block<>(Index, Index, Index, Index).
 template<int NRows, int NCols>
-inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                               Index blockRows, Index blockCols) const
 {
   return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
+/// \returns an expression of the \a i-th column of \c *this. Note that the numbering starts at 0.
 ///
 /// Example: \include MatrixBase_col.cpp
 /// Output: \verbinclude MatrixBase_col.out
@@ -834,20 +1093,20 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 /**
   * \sa row(), class Block */
-EIGEN_DEVICE_FUNC
-inline ColXpr col(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ColXpr col(Index i)
 {
   return ColXpr(derived(), i);
 }
 
 /// This is the const version of col().
-EIGEN_DEVICE_FUNC
-inline ConstColXpr col(Index i) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ConstColXpr col(Index i) const
 {
   return ConstColXpr(derived(), i);
 }
 
-/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
+/// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0.
 ///
 /// Example: \include MatrixBase_row.cpp
 /// Output: \verbinclude MatrixBase_row.out
@@ -855,109 +1114,166 @@ inline ConstColXpr col(Index i) const
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 /**
   * \sa col(), class Block */
-EIGEN_DEVICE_FUNC
-inline RowXpr row(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+RowXpr row(Index i)
 {
   return RowXpr(derived(), i);
 }
 
 /// This is the const version of row(). */
-EIGEN_DEVICE_FUNC
-inline ConstRowXpr row(Index i) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ConstRowXpr row(Index i) const
 {
   return ConstRowXpr(derived(), i);
 }
 
-/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
+/// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes.
 ///
 /// \only_for_vectors
 ///
 /// \param start the first coefficient in the segment
 /// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
 ///
 /// Example: \include MatrixBase_segment_int_int.cpp
 /// Output: \verbinclude MatrixBase_segment_int_int.out
 ///
-/// \note Even though the returned expression has dynamic size, in the case
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
 /// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
 /// which means that evaluating it does not cause a dynamic memory allocation.
 ///
-/// \sa class Block, segment(Index)
+/// \sa block(Index,Index,NRowsType,NColsType), fix<N>, fix<N>(int), class Block
 ///
-EIGEN_DEVICE_FUNC
-inline SegmentReturnType segment(Index start, Index n)
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+typename FixedSegmentReturnType<...>::Type
+#endif
+segment(Index start, NType n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), start, n);
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), start, internal::get_runtime_value(n));
 }
 
 
-/// This is the const version of segment(Index,Index).
-EIGEN_DEVICE_FUNC
-inline ConstSegmentReturnType segment(Index start, Index n) const
+/// This is the const version of segment(Index,NType).
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+segment(Index start, NType n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), start, n);
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), start, internal::get_runtime_value(n));
 }
 
-/// \returns a dynamic-size expression of the first coefficients of *this.
+/// \returns an expression of the first coefficients of \c *this with either dynamic or fixed sizes.
 ///
 /// \only_for_vectors
 ///
 /// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
 ///
 /// Example: \include MatrixBase_start_int.cpp
 /// Output: \verbinclude MatrixBase_start_int.out
 ///
-/// \note Even though the returned expression has dynamic size, in the case
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
 /// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
 /// which means that evaluating it does not cause a dynamic memory allocation.
 ///
 /// \sa class Block, block(Index,Index)
 ///
-EIGEN_DEVICE_FUNC
-inline SegmentReturnType head(Index n)
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+typename FixedSegmentReturnType<...>::Type
+#endif
+head(NType n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), 0, n);
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+              (derived(), 0, internal::get_runtime_value(n));
 }
 
-/// This is the const version of head(Index).
-EIGEN_DEVICE_FUNC
-inline ConstSegmentReturnType head(Index n) const
+/// This is the const version of head(NType).
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+head(NType n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), 0, n);
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), 0, internal::get_runtime_value(n));
 }
 
-/// \returns a dynamic-size expression of the last coefficients of *this.
+/// \returns an expression of a last coefficients of \c *this with either dynamic or fixed sizes.
 ///
 /// \only_for_vectors
 ///
 /// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
 ///
 /// Example: \include MatrixBase_end_int.cpp
 /// Output: \verbinclude MatrixBase_end_int.out
 ///
-/// \note Even though the returned expression has dynamic size, in the case
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
 /// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
 /// which means that evaluating it does not cause a dynamic memory allocation.
 ///
 /// \sa class Block, block(Index,Index)
 ///
-EIGEN_DEVICE_FUNC
-inline SegmentReturnType tail(Index n)
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+typename FixedSegmentReturnType<...>::Type
+#endif
+tail(NType n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), this->size() - n, n);
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n));
 }
 
 /// This is the const version of tail(Index).
-EIGEN_DEVICE_FUNC
-inline ConstSegmentReturnType tail(Index n) const
+template<typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+tail(NType n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), this->size() - n, n);
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+            (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n));
 }
 
 /// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
@@ -974,11 +1290,11 @@ inline ConstSegmentReturnType tail(Index n) const
 /// Example: \include MatrixBase_template_int_segment.cpp
 /// Output: \verbinclude MatrixBase_template_int_segment.out
 ///
-/// \sa class Block
+/// \sa segment(Index,NType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
@@ -986,14 +1302,14 @@ inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N
 
 /// This is the const version of segment<int>(Index).
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/// \returns a fixed-size expression of the first coefficients of *this.
+/// \returns a fixed-size expression of the first coefficients of \c *this.
 ///
 /// \only_for_vectors
 ///
@@ -1006,11 +1322,11 @@ inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index
 /// Example: \include MatrixBase_template_int_start.cpp
 /// Output: \verbinclude MatrixBase_template_int_start.out
 ///
-/// \sa class Block
+/// \sa head(NType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedSegmentReturnType<N>::Type head(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
@@ -1018,14 +1334,14 @@ inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
 
 /// This is the const version of head<int>().
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/// \returns a fixed-size expression of the last coefficients of *this.
+/// \returns a fixed-size expression of the last coefficients of \c *this.
 ///
 /// \only_for_vectors
 ///
@@ -1038,11 +1354,11 @@ inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
 /// Example: \include MatrixBase_template_int_end.cpp
 /// Output: \verbinclude MatrixBase_template_int_end.out
 ///
-/// \sa class Block
+/// \sa tail(NType), class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename FixedSegmentReturnType<N>::Type tail(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
@@ -1050,9 +1366,77 @@ inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
 
 /// This is the const version of tail<int>.
 template<int N>
-EIGEN_DEVICE_FUNC
-inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n);
 }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+InnerVectorReturnType innerVector(Index outer)
+{ return InnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const ConstInnerVectorReturnType innerVector(Index outer) const
+{ return ConstInnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+InnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize)
+{
+  return Block<Derived,Dynamic,Dynamic,true>(derived(),
+                                             IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                             IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const ConstInnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize) const
+{
+  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
+                                                  IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                                  IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
+/** \returns the i-th subvector (column or vector) according to the \c Direction
+  * \sa subVectors()
+  */
+template<DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename internal::conditional<Direction==Vertical,ColXpr,RowXpr>::type
+subVector(Index i)
+{
+  return typename internal::conditional<Direction==Vertical,ColXpr,RowXpr>::type(derived(),i);
+}
+
+/** This is the const version of subVector(Index) */
+template<DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename internal::conditional<Direction==Vertical,ConstColXpr,ConstRowXpr>::type
+subVector(Index i) const
+{
+  return typename internal::conditional<Direction==Vertical,ConstColXpr,ConstRowXpr>::type(derived(),i);
+}
+
+/** \returns the number of subvectors (rows or columns) in the direction \c Direction
+  * \sa subVector(Index)
+  */
+template<DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR
+Index subVectors() const
+{ return (Direction==Vertical)?cols():rows(); }
diff --git a/contrib/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h b/contrib/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 89f4faaac6b60f1699d99d7979f52b2687e858af..5418dc4154f2b078e964383d6bb5acebe0bf3210 100644
--- a/contrib/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/contrib/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -76,6 +76,20 @@ conjugate() const
   return ConjugateReturnType(derived());
 }
 
+/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa conjugate()
+template<bool Cond>
+EIGEN_DEVICE_FUNC
+inline typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type
+conjugateIf() const
+{
+  typedef typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type ReturnType;
+  return ReturnType(derived());
+}
+
 /// \returns a read-only expression of the real part of \c *this.
 ///
 EIGEN_DOC_UNARY_ADDONS(real,real part function)
diff --git a/contrib/eigen/Eigen/src/plugins/IndexedViewMethods.h b/contrib/eigen/Eigen/src/plugins/IndexedViewMethods.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bfb19ac6cdfd5a66fdf15f001d72f7efc2a8300
--- /dev/null
+++ b/contrib/eigen/Eigen/src/plugins/IndexedViewMethods.h
@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if !defined(EIGEN_PARSED_BY_DOXYGEN)
+
+// This file is automatically included twice to generate const and non-const versions
+
+#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+#define EIGEN_INDEXED_VIEW_METHOD_CONST const
+#define EIGEN_INDEXED_VIEW_METHOD_TYPE  ConstIndexedViewType
+#else
+#define EIGEN_INDEXED_VIEW_METHOD_CONST
+#define EIGEN_INDEXED_VIEW_METHOD_TYPE IndexedViewType
+#endif
+
+#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+protected:
+
+// define some aliases to ease readability
+
+template<typename Indices>
+struct IvcRowType : public internal::IndexedViewCompatibleType<Indices,RowsAtCompileTime> {};
+
+template<typename Indices>
+struct IvcColType : public internal::IndexedViewCompatibleType<Indices,ColsAtCompileTime> {};
+
+template<typename Indices>
+struct IvcType : public internal::IndexedViewCompatibleType<Indices,SizeAtCompileTime> {};
+
+typedef typename internal::IndexedViewCompatibleType<Index,1>::type IvcIndex;
+
+template<typename Indices>
+typename IvcRowType<Indices>::type
+ivcRow(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,RowsAtCompileTime>(derived().rows()),Specialized);
+}
+
+template<typename Indices>
+typename IvcColType<Indices>::type
+ivcCol(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,ColsAtCompileTime>(derived().cols()),Specialized);
+}
+
+template<typename Indices>
+typename IvcColType<Indices>::type
+ivcSize(const Indices& indices) const {
+  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,SizeAtCompileTime>(derived().size()),Specialized);
+}
+
+public:
+
+#endif
+
+template<typename RowIndices, typename ColIndices>
+struct EIGEN_INDEXED_VIEW_METHOD_TYPE {
+  typedef IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,
+                      typename IvcRowType<RowIndices>::type,
+                      typename IvcColType<ColIndices>::type> type;
+};
+
+// This is the generic version
+
+template<typename RowIndices, typename ColIndices>
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
+  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsIndexedView,
+  typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type >::type
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type
+            (derived(), ivcRow(rowIndices), ivcCol(colIndices));
+}
+
+// The following overload returns a Block<> object
+
+template<typename RowIndices, typename ColIndices>
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
+  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsBlock,
+  typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType>::type
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  typedef typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType BlockType;
+  typename IvcRowType<RowIndices>::type actualRowIndices = ivcRow(rowIndices);
+  typename IvcColType<ColIndices>::type actualColIndices = ivcCol(colIndices);
+  return BlockType(derived(),
+                   internal::first(actualRowIndices),
+                   internal::first(actualColIndices),
+                   internal::size(actualRowIndices),
+                   internal::size(actualColIndices));
+}
+
+// The following overload returns a Scalar
+
+template<typename RowIndices, typename ColIndices>
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
+  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsScalar,
+  CoeffReturnType >::type
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols()));
+}
+
+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
+// The following three overloads are needed to handle raw Index[N] arrays.
+
+template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndices>
+IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
+operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
+                    (derived(), rowIndices, ivcCol(colIndices));
+}
+
+template<typename RowIndices, typename ColIndicesT, std::size_t ColIndicesN>
+IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcRowType<RowIndices>::type, const ColIndicesT (&)[ColIndicesN]>
+operator()(const RowIndices& rowIndices, const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcRowType<RowIndices>::type,const ColIndicesT (&)[ColIndicesN]>
+                    (derived(), ivcRow(rowIndices), colIndices);
+}
+
+template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndicesT, std::size_t ColIndicesN>
+IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN], const ColIndicesT (&)[ColIndicesN]>
+operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],const ColIndicesT (&)[ColIndicesN]>
+                    (derived(), rowIndices, colIndices);
+}
+
+#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
+// Overloads for 1D vectors/arrays
+
+template<typename Indices>
+typename internal::enable_if<
+  IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
+  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type> >::type
+operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type>
+            (derived(), IvcIndex(0), ivcCol(indices));
+}
+
+template<typename Indices>
+typename internal::enable_if<
+  (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
+  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex> >::type
+operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex>
+            (derived(), ivcRow(indices), IvcIndex(0));
+}
+
+template<typename Indices>
+typename internal::enable_if<
+  (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_valid_index_type<Indices>::value) && (!symbolic::is_symbolic<Indices>::value),
+  VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value> >::type
+operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  typename IvcType<Indices>::type actualIndices = ivcSize(indices);
+  return VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value>
+            (derived(), internal::first(actualIndices), internal::size(actualIndices));
+}
+
+template<typename IndexType>
+typename internal::enable_if<symbolic::is_symbolic<IndexType>::value, CoeffReturnType >::type
+operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  return Base::operator()(internal::eval_expr_given_size(id,size()));
+}
+
+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
+template<typename IndicesT, std::size_t IndicesN>
+typename internal::enable_if<IsRowMajor,
+  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]> >::type
+operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]>
+            (derived(), IvcIndex(0), indices);
+}
+
+template<typename IndicesT, std::size_t IndicesN>
+typename internal::enable_if<!IsRowMajor,
+  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const IndicesT (&)[IndicesN],IvcIndex> >::type
+operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const IndicesT (&)[IndicesN],IvcIndex>
+            (derived(), indices, IvcIndex(0));
+}
+
+#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+
+#undef EIGEN_INDEXED_VIEW_METHOD_CONST
+#undef EIGEN_INDEXED_VIEW_METHOD_TYPE
+
+#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+#define EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+#include "IndexedViewMethods.h"
+#undef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
+#endif
+
+#else // EIGEN_PARSED_BY_DOXYGEN
+
+/**
+  * \returns a generic submatrix view defined by the rows and columns indexed \a rowIndices and \a colIndices respectively.
+  *
+  * Each parameter must either be:
+  *  - An integer indexing a single row or column
+  *  - Eigen::all indexing the full set of respective rows or columns in increasing order
+  *  - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions
+  *  - Any %Eigen's vector/array of integers or expressions
+  *  - Plain C arrays: \c int[N]
+  *  - And more generally any type exposing the following two member functions:
+  * \code
+  * <integral type> operator[](<integral type>) const;
+  * <integral type> size() const;
+  * \endcode
+  * where \c <integral \c type>  stands for any integer type compatible with Eigen::Index (i.e. \c std::ptrdiff_t).
+  *
+  * The last statement implies compatibility with \c std::vector, \c std::valarray, \c std::array, many of the Range-v3's ranges, etc.
+  *
+  * If the submatrix can be represented using a starting position \c (i,j) and positive sizes \c (rows,columns), then this
+  * method will returns a Block object after extraction of the relevant information from the passed arguments. This is the case
+  * when all arguments are either:
+  *  - An integer
+  *  - Eigen::all
+  *  - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and Eigen::seqN(a,N).
+  *
+  * Otherwise a more general IndexedView<Derived,RowIndices',ColIndices'> object will be returned, after conversion of the inputs
+  * to more suitable types \c RowIndices' and \c ColIndices'.
+  *
+  * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter.
+  *
+  * See also this <a href="https://stackoverflow.com/questions/46110917/eigen-replicate-items-along-one-dimension-without-useless-allocations">question</a> and its answer for an example of how to duplicate coefficients.
+  *
+  * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index)
+  */
+template<typename RowIndices, typename ColIndices>
+IndexedView_or_Block
+operator()(const RowIndices& rowIndices, const ColIndices& colIndices);
+
+/** This is an overload of operator()(const RowIndices&, const ColIndices&) for 1D vectors or arrays
+  *
+  * \only_for_vectors
+  */
+template<typename Indices>
+IndexedView_or_VectorBlock
+operator()(const Indices& indices);
+
+#endif  // EIGEN_PARSED_BY_DOXYGEN
diff --git a/contrib/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/contrib/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h
index f1084abefbcdcb8368683fe01a4f01c71cc85621..a0feef8716a4911e2da8011177a38c7dd051b6ae 100644
--- a/contrib/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ b/contrib/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h
@@ -39,10 +39,10 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>
+inline const CwiseBinaryOp<numext::equal_to<Scalar>, const Derived, const OtherDerived>
 cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<numext::equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise != operator of *this and \a other
@@ -59,10 +59,10 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   */
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
-inline const CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>
+inline const CwiseBinaryOp<numext::not_equal_to<Scalar>, const Derived, const OtherDerived>
 cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
-  return CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+  return CwiseBinaryOp<numext::not_equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the coefficient-wise min of *this and \a other
diff --git a/contrib/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/contrib/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h
index b1be3d566c66f2008146ebae6eb3bcd14a36decc..0514d8f78b5ff44ed659113f85e5aa0e512cc6c5 100644
--- a/contrib/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ b/contrib/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h
@@ -14,6 +14,7 @@
 
 typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
 typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> CwiseArgReturnType;
 typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
 typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
 typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
@@ -82,4 +83,13 @@ EIGEN_DEVICE_FUNC
 inline const CwiseInverseReturnType
 cwiseInverse() const { return CwiseInverseReturnType(derived()); }
 
+/// \returns an expression of the coefficient-wise phase angle of \c *this
+///
+/// Example: \include MatrixBase_cwiseArg.cpp
+/// Output: \verbinclude MatrixBase_cwiseArg.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseArg,arg)
 
+EIGEN_DEVICE_FUNC
+inline const CwiseArgReturnType
+cwiseArg() const { return CwiseArgReturnType(derived()); }
diff --git a/contrib/eigen/Eigen/src/plugins/ReshapedMethods.h b/contrib/eigen/Eigen/src/plugins/ReshapedMethods.h
new file mode 100644
index 0000000000000000000000000000000000000000..482a6b045ef5a530517f55785b2ce27b448595d8
--- /dev/null
+++ b/contrib/eigen/Eigen/src/plugins/ReshapedMethods.h
@@ -0,0 +1,149 @@
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+/// \returns an expression of \c *this with reshaped sizes.
+///
+/// \param nRows the number of rows in the reshaped expression, specified at either run-time or compile-time, or AutoSize
+/// \param nCols the number of columns in the reshaped expression, specified at either run-time or compile-time, or AutoSize
+/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor),
+///               or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor.
+/// \tparam NRowsType the type of the value handling the number of rows, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns, typically Index.
+///
+/// Dynamic size example: \include MatrixBase_reshaped_int_int.cpp
+/// Output: \verbinclude MatrixBase_reshaped_int_int.out
+///
+/// The number of rows \a nRows and columns \a nCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic.
+/// Here is an example with a fixed number of rows and columns:
+/// \include MatrixBase_reshaped_fixed.cpp
+/// Output: \verbinclude MatrixBase_reshaped_fixed.out
+///
+/// Finally, one of the sizes parameter can be automatically deduced from the other one by passing AutoSize as in the following example:
+/// \include MatrixBase_reshaped_auto.cpp
+/// Output: \verbinclude MatrixBase_reshaped_auto.out
+/// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and
+/// that the other size is passed at compile-time using Eigen::fix<N> as above.
+///
+/// \sa class Reshaped, fix, fix<N>(int)
+///
+template<int Order = ColMajor, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC
+inline Reshaped<Derived,...>
+reshaped(NRowsType nRows, NColsType nCols);
+
+/// This is the const version of reshaped(NRowsType,NColsType).
+template<int Order = ColMajor, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC
+inline const Reshaped<const Derived,...>
+reshaped(NRowsType nRows, NColsType nCols) const;
+
+/// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector
+///
+/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor),
+///               or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor.
+///
+/// This overloads is essentially a shortcut for `A.reshaped<Order>(AutoSize,fix<1>)`.
+///
+/// - If `Order==ColMajor` (the default), then it returns a column-vector from the stacked columns of \c *this.
+/// - If `Order==RowMajor`, then it returns a column-vector from the stacked rows of \c *this.
+/// - If `Order==AutoOrder`, then it returns a column-vector with elements stacked following the storage order of \c *this.
+///   This mode is the recommended one when the particular ordering of the element is not relevant.
+///
+/// Example:
+/// \include MatrixBase_reshaped_to_vector.cpp
+/// Output: \verbinclude MatrixBase_reshaped_to_vector.out
+///
+/// If you want more control, you can still fall back to reshaped(NRowsType,NColsType).
+///
+/// \sa reshaped(NRowsType,NColsType), class Reshaped
+///
+template<int Order = ColMajor>
+EIGEN_DEVICE_FUNC
+inline Reshaped<Derived,...>
+reshaped();
+
+/// This is the const version of reshaped().
+template<int Order = ColMajor>
+EIGEN_DEVICE_FUNC
+inline const Reshaped<const Derived,...>
+reshaped() const;
+
+#else
+
+// This file is automatically included twice to generate const and non-const versions
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+#define EIGEN_RESHAPED_METHOD_CONST const
+#else
+#define EIGEN_RESHAPED_METHOD_CONST
+#endif
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+
+// This part is included once
+
+#endif
+
+template<typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC
+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
+                internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value>
+reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST
+{
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                  internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value>
+                (derived(),
+                 internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()),
+                 internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size()));
+}
+
+template<int Order, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC
+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
+                internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value,
+                internal::get_compiletime_reshape_order<Flags,Order>::value>
+reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST
+{
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                  internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_order<Flags,Order>::value>
+                (derived(),
+                 internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()),
+                 internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size()));
+}
+
+// Views as linear vectors
+
+EIGEN_DEVICE_FUNC
+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,SizeAtCompileTime,1>
+reshaped() EIGEN_RESHAPED_METHOD_CONST
+{
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,SizeAtCompileTime,1>(derived(),size(),1);
+}
+
+template<int Order>
+EIGEN_DEVICE_FUNC
+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
+                internal::get_compiletime_reshape_order<Flags,Order>::value>
+reshaped() EIGEN_RESHAPED_METHOD_CONST
+{
+  EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER);
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
+                  internal::get_compiletime_reshape_order<Flags,Order>::value>
+                (derived(), size(), 1);
+}
+
+#undef EIGEN_RESHAPED_METHOD_CONST
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+#define EIGEN_RESHAPED_METHOD_2ND_PASS
+#include "ReshapedMethods.h"
+#undef EIGEN_RESHAPED_METHOD_2ND_PASS
+#endif
+
+#endif // EIGEN_PARSED_BY_DOXYGEN
diff --git a/contrib/eigen/README.md b/contrib/eigen/README.md
index 4654a81c37243018abc50c6dbc353abf4164bd70..9b40e9ed490cc37fbd5e17e9153187ef00a924d2 100644
--- a/contrib/eigen/README.md
+++ b/contrib/eigen/README.md
@@ -1,3 +1,5 @@
 **Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
 
 For more information go to http://eigen.tuxfamily.org/.
+
+For ***pull request***, ***bug reports***, and ***feature requests***, go to https://gitlab.com/libeigen/eigen.
diff --git a/contrib/eigen/bench/BenchTimer.h b/contrib/eigen/bench/BenchTimer.h
index ea28496b77cb83c11b56bb935b97099c1baa4db1..8a0dbbe817109329a0e7a49fdc4f8a9afe120634 100644
--- a/contrib/eigen/bench/BenchTimer.h
+++ b/contrib/eigen/bench/BenchTimer.h
@@ -28,11 +28,15 @@
 #endif
 
 static void escape(void *p) {
+#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
   asm volatile("" : : "g"(p) : "memory");
+#endif
 }
 
 static void clobber() {
+#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
   asm volatile("" : : : "memory");
+#endif
 }
 
 #include <Eigen/Core>
diff --git a/contrib/eigen/bench/analyze-blocking-sizes.cpp b/contrib/eigen/bench/analyze-blocking-sizes.cpp
index d563a1d2da4c98d5eb7241278240c61e98ffce4c..6bc4aca3dbf7cafb182b7eb9f1f5685030343350 100644
--- a/contrib/eigen/bench/analyze-blocking-sizes.cpp
+++ b/contrib/eigen/bench/analyze-blocking-sizes.cpp
@@ -825,7 +825,7 @@ int main(int argc, char* argv[])
   }
   for (int i = 1; i < argc; i++) {
     bool arg_handled = false;
-    // Step 1. Try to match action invokation names.
+    // Step 1. Try to match action invocation names.
     for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
       if (!strcmp(argv[i], (*it)->invokation_name())) {
         if (!action) {
diff --git a/contrib/eigen/bench/basicbenchmark.h b/contrib/eigen/bench/basicbenchmark.h
index 3fdc35732f02010997e1bcfd745854a60cdf69be..8059375b5e1fde71a28583b2c62f667f5cb84154 100644
--- a/contrib/eigen/bench/basicbenchmark.h
+++ b/contrib/eigen/bench/basicbenchmark.h
@@ -16,13 +16,13 @@ void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations)
     {
       asm("#begin_bench_loop LazyEval");
       if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
-      m = (I + 0.00005 * (m + m.lazy() * m)).eval();
+      m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
     }
     else if (Mode==OmpEval)
     {
       asm("#begin_bench_loop OmpEval");
       if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
-      m = (I + 0.00005 * (m + m.lazy() * m)).evalOMP();
+      m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
     }
     else
     {
diff --git a/contrib/eigen/bench/benchCholesky.cpp b/contrib/eigen/bench/benchCholesky.cpp
index 9a8e7cf638857b3c2999a3c9bda91ddac868fd11..0dc94e5b4983974a379c012808e54d7cc701778e 100644
--- a/contrib/eigen/bench/benchCholesky.cpp
+++ b/contrib/eigen/bench/benchCholesky.cpp
@@ -1,5 +1,4 @@
-
-// g++ -DNDEBUG -O3 -I.. benchLLT.cpp  -o benchLLT && ./benchLLT
+// g++ -DNDEBUG -O3 -I.. benchCholesky.cpp  -o benchCholesky && ./benchCholesky
 // options:
 //  -DBENCH_GSL -lgsl /usr/lib/libcblas.so.3
 //  -DEIGEN_DONT_VECTORIZE
diff --git a/contrib/eigen/bench/bench_gemm.cpp b/contrib/eigen/bench/bench_gemm.cpp
index dccab96a8bb51290666fa0761e6a45f998ba5cc7..78ca1cd133f4dba4b297a6bfc61eed5998a60cef 100644
--- a/contrib/eigen/bench/bench_gemm.cpp
+++ b/contrib/eigen/bench/bench_gemm.cpp
@@ -11,8 +11,9 @@
 //
 
 #include <iostream>
-#include <Eigen/Core>
 #include <bench/BenchTimer.h>
+#include <Eigen/Core>
+
 
 using namespace std;
 using namespace Eigen;
@@ -30,10 +31,22 @@ using namespace Eigen;
 #define SCALARB SCALAR
 #endif
 
+#ifdef ROWMAJ_A
+const int opt_A = RowMajor;
+#else
+const int opt_A = ColMajor;
+#endif
+
+#ifdef ROWMAJ_B
+const int opt_B = RowMajor;
+#else
+const int opt_B = ColMajor;
+#endif
+
 typedef SCALAR Scalar;
 typedef NumTraits<Scalar>::Real RealScalar;
-typedef Matrix<SCALARA,Dynamic,Dynamic> A;
-typedef Matrix<SCALARB,Dynamic,Dynamic> B;
+typedef Matrix<SCALARA,Dynamic,Dynamic,opt_A> A;
+typedef Matrix<SCALARB,Dynamic,Dynamic,opt_B> B;
 typedef Matrix<Scalar,Dynamic,Dynamic> C;
 typedef Matrix<RealScalar,Dynamic,Dynamic> M;
 
@@ -58,45 +71,61 @@ static char lower = 'L';
 static char right = 'R';
 static int intone = 1;
 
-void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
+#ifdef ROWMAJ_A
+const char transA = trans;
+#else
+const char transA = notrans;
+#endif
+
+#ifdef ROWMAJ_B
+const char transB = trans;
+#else
+const char transB = notrans;
+#endif
+
+template<typename A,typename B>
+void blas_gemm(const A& a, const B& b, MatrixXf& c)
 {
   int M = c.rows(); int N = c.cols(); int K = a.cols();
-  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
+  int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
 
-  sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
+  sgemm_(&transA,&transB,&M,&N,&K,&fone,
          const_cast<float*>(a.data()),&lda,
          const_cast<float*>(b.data()),&ldb,&fone,
          c.data(),&ldc);
 }
 
-EIGEN_DONT_INLINE void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
+template<typename A,typename B>
+void blas_gemm(const A& a, const B& b, MatrixXd& c)
 {
   int M = c.rows(); int N = c.cols(); int K = a.cols();
-  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
+  int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
 
-  dgemm_(&notrans,&notrans,&M,&N,&K,&done,
+  dgemm_(&transA,&transB,&M,&N,&K,&done,
          const_cast<double*>(a.data()),&lda,
          const_cast<double*>(b.data()),&ldb,&done,
          c.data(),&ldc);
 }
 
-void blas_gemm(const MatrixXcf& a, const MatrixXcf& b, MatrixXcf& c)
+template<typename A,typename B>
+void blas_gemm(const A& a, const B& b, MatrixXcf& c)
 {
   int M = c.rows(); int N = c.cols(); int K = a.cols();
-  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
+  int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
 
-  cgemm_(&notrans,&notrans,&M,&N,&K,(float*)&cfone,
+  cgemm_(&transA,&transB,&M,&N,&K,(float*)&cfone,
          const_cast<float*>((const float*)a.data()),&lda,
          const_cast<float*>((const float*)b.data()),&ldb,(float*)&cfone,
          (float*)c.data(),&ldc);
 }
 
-void blas_gemm(const MatrixXcd& a, const MatrixXcd& b, MatrixXcd& c)
+template<typename A,typename B>
+void blas_gemm(const A& a, const B& b, MatrixXcd& c)
 {
   int M = c.rows(); int N = c.cols(); int K = a.cols();
-  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
+  int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows();
 
-  zgemm_(&notrans,&notrans,&M,&N,&K,(double*)&cdone,
+  zgemm_(&transA,&transB,&M,&N,&K,(double*)&cdone,
          const_cast<double*>((const double*)a.data()),&lda,
          const_cast<double*>((const double*)b.data()),&ldb,(double*)&cdone,
          (double*)c.data(),&ldc);
@@ -127,10 +156,12 @@ void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci)
   ci.noalias() += ai * b;
 }
 
+
+
 template<typename A, typename B, typename C>
 EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c)
 {
- c.noalias() += a * b;
+  c.noalias() += a * b;
 }
 
 int main(int argc, char ** argv)
@@ -180,8 +211,8 @@ int main(int argc, char ** argv)
       }
       else if(argv[i][1]=='t')
       {
+        tries = atoi(argv[++i]);
         ++i;
-        tries = atoi(argv[i++]);
       }
       else if(argv[i][1]=='p')
       {
@@ -217,7 +248,7 @@ int main(int argc, char ** argv)
   std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n";
   std::ptrdiff_t mc(m), nc(n), kc(p);
   internal::computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-  std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << "\n";
+  std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << " x " << nc << "\n";
 
   C r = c;
 
@@ -241,7 +272,7 @@ int main(int argc, char ** argv)
     blas_gemm(a,b,r);
     c.noalias() += a * b;
     if(!r.isApprox(c)) {
-      std::cout << (r  - c).norm() << "\n";
+      std::cout << (r  - c).norm()/r.norm() << "\n";
       std::cerr << "Warning, your product is crap!\n\n";
     }
   #else
@@ -250,7 +281,7 @@ int main(int argc, char ** argv)
       gemm(a,b,c);
       r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
       if(!r.isApprox(c)) {
-        std::cout << (r  - c).norm() << "\n";
+        std::cout << (r  - c).norm()/r.norm() << "\n";
         std::cerr << "Warning, your product is crap!\n\n";
       }
     }
@@ -264,6 +295,9 @@ int main(int argc, char ** argv)
   std::cout << "blas  real        " << tblas.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n";
   #endif
 
+  // warm start
+  if(b.norm()+a.norm()==123.554) std::cout << "\n";
+
   BenchTimer tmt;
   c = rc;
   BENCH(tmt, tries, rep, gemm(a,b,c));
@@ -286,11 +320,11 @@ int main(int argc, char ** argv)
   
   if(1.*m*n*p<30*30*30)
   {
-      BenchTimer tmt;
-      c = rc;
-      BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
-      std::cout << "lazy cpu         " << tmt.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmt.total(CPU_TIMER)  << "s)\n";
-      std::cout << "lazy real        " << tmt.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
+    BenchTimer tmt;
+    c = rc;
+    BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
+    std::cout << "lazy cpu         " << tmt.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << tmt.total(CPU_TIMER)  << "s)\n";
+    std::cout << "lazy real        " << tmt.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
   }
   
   #ifdef DECOUPLED
diff --git a/contrib/eigen/bench/bench_move_semantics.cpp b/contrib/eigen/bench/bench_move_semantics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..323d80417c0c87316c6f616ba031c4dfa01b8a6b
--- /dev/null
+++ b/contrib/eigen/bench/bench_move_semantics.cpp
@@ -0,0 +1,57 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Sebastien Boisvert <seb@boisvert.info>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "BenchTimer.h"
+#include "../test/MovableScalar.h"
+
+#include <Eigen/Core>
+
+#include <iostream>
+#include <utility>
+
+template <typename MatrixType>
+void copy_matrix(MatrixType& m)
+{
+  MatrixType tmp(m);
+  m = tmp;
+}
+
+template <typename MatrixType>
+void move_matrix(MatrixType&& m)
+{
+  MatrixType tmp(std::move(m));
+  m = std::move(tmp);
+}
+
+template<typename Scalar>
+void bench(const std::string& label)
+{
+  using MatrixType = Eigen::Matrix<Eigen::MovableScalar<Scalar>,1,10>;
+  Eigen::BenchTimer t;
+
+  int tries = 10;
+  int rep = 1000000;
+
+  MatrixType data = MatrixType::Random().eval();
+  MatrixType dest;
+
+  BENCH(t, tries, rep, copy_matrix(data));
+  std::cout << label << " copy semantics: " << 1e3*t.best(Eigen::CPU_TIMER) << " ms" << std::endl;
+
+  BENCH(t, tries, rep, move_matrix(std::move(data)));
+  std::cout << label << " move semantics: " << 1e3*t.best(Eigen::CPU_TIMER) << " ms" << std::endl;
+}
+
+int main()
+{
+  bench<float>("float");
+  bench<double>("double");
+  return 0;
+}
+
diff --git a/contrib/eigen/bench/bench_norm.cpp b/contrib/eigen/bench/bench_norm.cpp
index 129afcfb25635afb2553e27d586695ae88ab5c15..592f25d66e0744dac7581d30669d5cade3cc8d5f 100644
--- a/contrib/eigen/bench/bench_norm.cpp
+++ b/contrib/eigen/bench/bench_norm.cpp
@@ -111,12 +111,12 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
     int nbig, ibeta, it, iemin, iemax, iexp;
     Scalar abig, eps;
 
-    nbig  = std::numeric_limits<int>::max();            // largest integer
-    ibeta = std::numeric_limits<Scalar>::radix; //NumTraits<Scalar>::Base;                    // base for floating-point numbers
-    it    = std::numeric_limits<Scalar>::digits; //NumTraits<Scalar>::Mantissa;                // number of base-beta digits in mantissa
-    iemin = std::numeric_limits<Scalar>::min_exponent;  // minimum exponent
-    iemax = std::numeric_limits<Scalar>::max_exponent;  // maximum exponent
-    rbig  = std::numeric_limits<Scalar>::max();         // largest floating-point number
+    nbig  = NumTraits<int>::highest();          // largest integer
+    ibeta = std::numeric_limits<Scalar>::radix; // NumTraits<Scalar>::Base;                    // base for floating-point numbers
+    it    = NumTraits<Scalar>::digits();        // NumTraits<Scalar>::Mantissa;                // number of base-beta digits in mantissa
+    iemin = NumTraits<Scalar>::min_exponent();  // minimum exponent
+    iemax = NumTraits<Scalar>::max_exponent();  // maximum exponent
+    rbig  = NumTraits<Scalar>::highest();       // largest floating-point number
 
     // Check the basic machine-dependent constants.
     if(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5)
@@ -134,7 +134,7 @@ EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
     iexp  = - ((iemax+it)/2);
     s2m   = std::pow(ibeta,iexp);   // scaling factor for upper range
 
-    overfl  = rbig*s2m;          // overfow boundary for abig
+    overfl  = rbig*s2m;          // overflow boundary for abig
     eps     = std::pow(ibeta, 1-it);
     relerr  = std::sqrt(eps);      // tolerance for neglecting asml
     abig    = 1.0/eps - 1.0;
diff --git a/contrib/eigen/bench/btl/CMakeLists.txt b/contrib/eigen/bench/btl/CMakeLists.txt
index 38ff9f483284b1749b0adba445e82f99612c515c..42094e867d29b4063419d30b58c1355c5e18cdf7 100644
--- a/contrib/eigen/bench/btl/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/CMakeLists.txt
@@ -1,35 +1,35 @@
-PROJECT(BTL)
+project(BTL)
 
-CMAKE_MINIMUM_REQUIRED(VERSION 2.6.2)
+cmake_minimum_required(VERSION 2.6.2)
 
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${Eigen_SOURCE_DIR}/cmake)
 include(MacroOptionalAddSubdirectory)
 
-OPTION(BTL_NOVEC "Disable SSE/Altivec optimizations when possible" OFF)
+option(BTL_NOVEC "Disable SSE/Altivec optimizations when possible" OFF)
 
-SET(CMAKE_INCLUDE_CURRENT_DIR ON)
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
 string(REGEX MATCH icpc IS_ICPC ${CMAKE_CXX_COMPILER})
-IF(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC)
-  SET(CMAKE_CXX_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_CXX_FLAGS}")
-  SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_Fortran_FLAGS}")
-  IF(BTL_NOVEC)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE")
-  ENDIF(BTL_NOVEC)
-ENDIF(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC)
-
-IF(MSVC)
-  SET(CMAKE_CXX_FLAGS " /O2 /Ot /GL /fp:fast -DNDEBUG")
-#   SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG")
-  IF(BTL_NOVEC)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE")
-  ENDIF(BTL_NOVEC)
-ENDIF(MSVC)
+if(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC)
+  set(CMAKE_CXX_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_Fortran_FLAGS}")
+  if(BTL_NOVEC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE")
+  endif(BTL_NOVEC)
+endif(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC)
+
+if(MSVC)
+  set(CMAKE_CXX_FLAGS " /O2 /Ot /GL /fp:fast -DNDEBUG")
+#   set(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG")
+  if(BTL_NOVEC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE")
+  endif(BTL_NOVEC)
+endif(MSVC)
 
 if(IS_ICPC)
   set(CMAKE_CXX_FLAGS "-fast ${CMAKE_CXX_FLAGS}")
   set(CMAKE_Fortran_FLAGS "-fast ${CMAKE_Fortran_FLAGS}")
-endif(IS_ICPC)
+endif()
 
 include_directories(
   ${PROJECT_SOURCE_DIR}/actions
@@ -41,7 +41,7 @@ include_directories(
 # if (MKL_FOUND)
 #   add_definitions(-DHAVE_MKL)
 #   set(DEFAULT_LIBRARIES ${MKL_LIBRARIES})
-# endif (MKL_FOUND)
+# endif ()
 
 find_library(EIGEN_BTL_RT_LIBRARY rt)
 # if we cannot find it easily, then we don't need it!
@@ -49,11 +49,11 @@ if(NOT EIGEN_BTL_RT_LIBRARY)
   set(EIGEN_BTL_RT_LIBRARY "")
 endif()
 
-MACRO(BTL_ADD_BENCH targetname)
+macro(BTL_ADD_BENCH targetname)
 
   foreach(_current_var ${ARGN})
     set(_last_var ${_current_var})
-  endforeach(_current_var)
+  endforeach()
 
   set(_sources ${ARGN})
   list(LENGTH _sources _argn_length)
@@ -64,17 +64,17 @@ MACRO(BTL_ADD_BENCH targetname)
 
   if (${_argn_length} EQUAL ${_src_length})
     set(_last_var ON)
-  endif (${_argn_length} EQUAL ${_src_length})
+  endif ()
 
-  OPTION(BUILD_${targetname} "Build benchmark ${targetname}" ${_last_var})
+  option(BUILD_${targetname} "Build benchmark ${targetname}" ${_last_var})
 
-  IF(BUILD_${targetname})
-    ADD_EXECUTABLE(${targetname} ${_sources})
-    ADD_TEST(${targetname} "${targetname}")
+  if(BUILD_${targetname})
+    add_executable(${targetname} ${_sources})
+    add_test(${targetname} "${targetname}")
     target_link_libraries(${targetname} ${DEFAULT_LIBRARIES} ${EIGEN_BTL_RT_LIBRARY})
-  ENDIF(BUILD_${targetname})
+  endif(BUILD_${targetname})
 
-ENDMACRO(BTL_ADD_BENCH)
+endmacro(BTL_ADD_BENCH)
 
 macro(btl_add_target_property target prop value)
 
@@ -86,9 +86,9 @@ macro(btl_add_target_property target prop value)
     set_target_properties(${target} PROPERTIES ${prop} "${previous} ${value}")
   endif()
 
-endmacro(btl_add_target_property)
+endmacro()
 
-ENABLE_TESTING()
+enable_testing()
 
 add_subdirectory(libs/eigen3)
 add_subdirectory(libs/eigen2)
diff --git a/contrib/eigen/bench/btl/README b/contrib/eigen/bench/btl/README
index f3f5fb36f783e2edbe76453ab274eafc04f07beb..ebed8896078fe7050e647266acc04af0e112e3cf 100644
--- a/contrib/eigen/bench/btl/README
+++ b/contrib/eigen/bench/btl/README
@@ -36,7 +36,7 @@ For instance:
 
 You can also select a given set of actions defining the environment variable BTL_CONFIG this way:
   BTL_CONFIG="-a action1{:action2}*" ctest -V
-An exemple:
+An example:
   BTL_CONFIG="-a axpy:vector_matrix:trisolve:ata" ctest -V -R eigen2
 
 Finally, if bench results already exist (the bench*.dat files) then they merges by keeping the best for each matrix size. If you want to overwrite the previous ones you can simply add the "--overwrite" option:
diff --git a/contrib/eigen/bench/btl/actions/basic_actions.hh b/contrib/eigen/bench/btl/actions/basic_actions.hh
index a3333ea26f6b8e29344e53ff37a2ef6c05ed3f90..62442f01fb42d51eeb486651e8616663de6beb00 100644
--- a/contrib/eigen/bench/btl/actions/basic_actions.hh
+++ b/contrib/eigen/bench/btl/actions/basic_actions.hh
@@ -6,7 +6,7 @@
 #include "action_atv_product.hh"
 
 #include "action_matrix_matrix_product.hh"
-// #include "action_ata_product.hh"
+#include "action_ata_product.hh"
 #include "action_aat_product.hh"
 
 #include "action_trisolve.hh"
diff --git a/contrib/eigen/bench/btl/cmake/FindACML.cmake b/contrib/eigen/bench/btl/cmake/FindACML.cmake
index 4989fa2f4c60a2c4afe2242b32c02f1c4e9dd705..daeeb535da721cee1b3ae87c95900894ec19a4bb 100644
--- a/contrib/eigen/bench/btl/cmake/FindACML.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindACML.cmake
@@ -1,7 +1,7 @@
 
 if (ACML_LIBRARIES)
   set(ACML_FIND_QUIETLY TRUE)
-endif (ACML_LIBRARIES)
+endif ()
 
 find_library(ACML_LIBRARIES
   NAMES
diff --git a/contrib/eigen/bench/btl/cmake/FindATLAS.cmake b/contrib/eigen/bench/btl/cmake/FindATLAS.cmake
index 4136a989d61565965f03cd1704f9284a83e8c8b0..572a4c0b212c06f3ce1277bc41f62ca67354d0db 100644
--- a/contrib/eigen/bench/btl/cmake/FindATLAS.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindATLAS.cmake
@@ -1,7 +1,7 @@
 
 if (ATLAS_LIBRARIES)
   set(ATLAS_FIND_QUIETLY TRUE)
-endif (ATLAS_LIBRARIES)
+endif ()
 
 find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib /usr/lib/atlas /usr/lib64 /usr/lib64/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
@@ -23,7 +23,7 @@ if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
 #     set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
 #   endif()
   
-endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
+endif()
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(ATLAS DEFAULT_MSG ATLAS_LIBRARIES)
diff --git a/contrib/eigen/bench/btl/cmake/FindBLAZE.cmake b/contrib/eigen/bench/btl/cmake/FindBLAZE.cmake
index dba4c89f2de55e394a148388cf594babc8cbf11f..18a878ff9aeb8e4262dff6abe9efcef9a82d9422 100644
--- a/contrib/eigen/bench/btl/cmake/FindBLAZE.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindBLAZE.cmake
@@ -15,7 +15,7 @@ if (BLAZE_INCLUDE_DIR)
   # in cache already
   set(BLAZE_FOUND TRUE)
 
-else (BLAZE_INCLUDE_DIR)
+else ()
 
 find_path(BLAZE_INCLUDE_DIR NAMES blaze/Blaze.h
      PATHS
@@ -27,5 +27,5 @@ find_package_handle_standard_args(BLAZE DEFAULT_MSG BLAZE_INCLUDE_DIR)
 
 mark_as_advanced(BLAZE_INCLUDE_DIR)
 
-endif(BLAZE_INCLUDE_DIR)
+endif()
 
diff --git a/contrib/eigen/bench/btl/cmake/FindBlitz.cmake b/contrib/eigen/bench/btl/cmake/FindBlitz.cmake
index 92880bbed7cc687dee65ba2419e236e383c26479..7ab375fd87b626d08b4ac6d2e1c7cc582903efeb 100644
--- a/contrib/eigen/bench/btl/cmake/FindBlitz.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindBlitz.cmake
@@ -15,7 +15,7 @@
 
 if (BLITZ_INCLUDES AND BLITZ_LIBRARIES)
   set(Blitz_FIND_QUIETLY TRUE)
-endif (BLITZ_INCLUDES AND BLITZ_LIBRARIES)
+endif ()
 
 find_path(BLITZ_INCLUDES
   NAMES
diff --git a/contrib/eigen/bench/btl/cmake/FindCBLAS.cmake b/contrib/eigen/bench/btl/cmake/FindCBLAS.cmake
index ce0f2f2b2d2d09973e45e8169a39d9f3cb738a50..43a90f7f656e4060ebfdd46af4b10c033facdbbd 100644
--- a/contrib/eigen/bench/btl/cmake/FindCBLAS.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindCBLAS.cmake
@@ -2,7 +2,7 @@
 
 if (CBLAS_INCLUDES AND CBLAS_LIBRARIES)
   set(CBLAS_FIND_QUIETLY TRUE)
-endif (CBLAS_INCLUDES AND CBLAS_LIBRARIES)
+endif ()
 
 find_path(CBLAS_INCLUDES
   NAMES
diff --git a/contrib/eigen/bench/btl/cmake/FindGMM.cmake b/contrib/eigen/bench/btl/cmake/FindGMM.cmake
index 5049c64edcea43027c7875c9efa328a849cdb53f..ff45e6a0cfaacb4882190ecc408b60b587350c8b 100644
--- a/contrib/eigen/bench/btl/cmake/FindGMM.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindGMM.cmake
@@ -1,7 +1,7 @@
 if (GMM_INCLUDE_DIR)
   # in cache already
   set(GMM_FOUND TRUE)
-else (GMM_INCLUDE_DIR)
+else ()
 
 find_path(GMM_INCLUDE_DIR NAMES gmm/gmm.h
      PATHS
@@ -14,4 +14,4 @@ FIND_PACKAGE_HANDLE_STANDARD_ARGS(GMM DEFAULT_MSG GMM_INCLUDE_DIR )
 
 mark_as_advanced(GMM_INCLUDE_DIR)
 
-endif(GMM_INCLUDE_DIR)
+endif()
diff --git a/contrib/eigen/bench/btl/cmake/FindMKL.cmake b/contrib/eigen/bench/btl/cmake/FindMKL.cmake
index f4d7c6ebe7dd629d20b7579ee6efa4dad989edcf..23e77279ab97ffa26264ef2a6584fd4a57d044f1 100644
--- a/contrib/eigen/bench/btl/cmake/FindMKL.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindMKL.cmake
@@ -1,7 +1,7 @@
 
 if (MKL_LIBRARIES)
   set(MKL_FIND_QUIETLY TRUE)
-endif (MKL_LIBRARIES)
+endif ()
 
 if(CMAKE_MINOR_VERSION GREATER 4)
 
@@ -30,7 +30,7 @@ if(MKL_LIBRARIES AND MKL_GUIDE)
   set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel_lp64 mkl_sequential ${MKL_GUIDE} pthread)
 endif()
 
-else(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
+else()
 
 find_library(MKL_LIBRARIES
   mkl_core
@@ -55,9 +55,9 @@ if(MKL_LIBRARIES AND MKL_GUIDE)
   set(MKL_LIBRARIES ${MKL_LIBRARIES} mkl_intel mkl_sequential ${MKL_GUIDE} pthread)
 endif()
 
-endif(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
+endif()
 
-endif(CMAKE_MINOR_VERSION GREATER 4)
+endif()
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARIES)
diff --git a/contrib/eigen/bench/btl/cmake/FindMTL4.cmake b/contrib/eigen/bench/btl/cmake/FindMTL4.cmake
index 3de4909802be09039c847931d323a81b8998fa50..1bafc93a6b43c9a851205ddb98aac61c488d4b14 100644
--- a/contrib/eigen/bench/btl/cmake/FindMTL4.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindMTL4.cmake
@@ -15,7 +15,7 @@ if (MTL4_INCLUDE_DIR)
   # in cache already
   set(MTL4_FOUND TRUE)
 
-else (MTL4_INCLUDE_DIR)
+else ()
 
 find_path(MTL4_INCLUDE_DIR NAMES boost/numeric/mtl/mtl.hpp
      PATHS
@@ -27,5 +27,5 @@ find_package_handle_standard_args(MTL4 DEFAULT_MSG MTL4_INCLUDE_DIR)
 
 mark_as_advanced(MTL4_INCLUDE_DIR)
 
-endif(MTL4_INCLUDE_DIR)
+endif()
 
diff --git a/contrib/eigen/bench/btl/cmake/FindOPENBLAS.cmake b/contrib/eigen/bench/btl/cmake/FindOPENBLAS.cmake
index 2a091943645d64c86dcae8b01211dbec129167da..5c0762306d65532e040adba0fd2899238a8134d1 100644
--- a/contrib/eigen/bench/btl/cmake/FindOPENBLAS.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindOPENBLAS.cmake
@@ -1,7 +1,7 @@
 
 if (OPENBLAS_LIBRARIES)
   set(OPENBLAS_FIND_QUIETLY TRUE)
-endif (OPENBLAS_LIBRARIES)
+endif ()
 
 find_file(OPENBLAS_LIBRARIES NAMES libopenblas.so libopenblas.so.0 PATHS /usr/lib /usr/lib64 $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
 find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
diff --git a/contrib/eigen/bench/btl/cmake/FindPackageHandleStandardArgs.cmake b/contrib/eigen/bench/btl/cmake/FindPackageHandleStandardArgs.cmake
index 7f122edcddd304b99fc1fd67ab1ca023a46edcb2..05d7e65bd2e06a34bf1f58073075ee45d9a223ba 100644
--- a/contrib/eigen/bench/btl/cmake/FindPackageHandleStandardArgs.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindPackageHandleStandardArgs.cmake
@@ -1,7 +1,7 @@
 # FIND_PACKAGE_HANDLE_STANDARD_ARGS(NAME (DEFAULT_MSG|"Custom failure message") VAR1 ... )
 #
 # This macro is intended to be used in FindXXX.cmake modules files.
-# It handles the REQUIRED and QUIET argument to FIND_PACKAGE() and
+# It handles the REQUIRED and QUIET argument to find_package() and
 # it also sets the <UPPERCASED_NAME>_FOUND variable.
 # The package is found if all variables listed are TRUE.
 # Example:
@@ -19,42 +19,42 @@
 # be "Could NOT find LibXml2", if you don't like this message you can specify
 # your own custom failure message there.
 
-MACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FAIL_MSG _VAR1 )
+macro(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FAIL_MSG _VAR1 )
 
-  IF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
-    IF (${_NAME}_FIND_REQUIRED)
-      SET(_FAIL_MESSAGE "Could not find REQUIRED package ${_NAME}")
-    ELSE (${_NAME}_FIND_REQUIRED)
-      SET(_FAIL_MESSAGE "Could not find OPTIONAL package ${_NAME}")
-    ENDIF (${_NAME}_FIND_REQUIRED)
-  ELSE("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
-    SET(_FAIL_MESSAGE "${_FAIL_MSG}")
-  ENDIF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
+  if("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
+    if (${_NAME}_FIND_REQUIRED)
+      set(_FAIL_MESSAGE "Could not find REQUIRED package ${_NAME}")
+    else (${_NAME}_FIND_REQUIRED)
+      set(_FAIL_MESSAGE "Could not find OPTIONAL package ${_NAME}")
+    endif (${_NAME}_FIND_REQUIRED)
+  else("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
+    set(_FAIL_MESSAGE "${_FAIL_MSG}")
+  endif("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
 
-  STRING(TOUPPER ${_NAME} _NAME_UPPER)
+  string(TOUPPER ${_NAME} _NAME_UPPER)
 
-  SET(${_NAME_UPPER}_FOUND TRUE)
-  IF(NOT ${_VAR1})
-    SET(${_NAME_UPPER}_FOUND FALSE)
-  ENDIF(NOT ${_VAR1})
+  set(${_NAME_UPPER}_FOUND TRUE)
+  if(NOT ${_VAR1})
+    set(${_NAME_UPPER}_FOUND FALSE)
+  endif(NOT ${_VAR1})
 
-  FOREACH(_CURRENT_VAR ${ARGN})
-    IF(NOT ${_CURRENT_VAR})
-      SET(${_NAME_UPPER}_FOUND FALSE)
-    ENDIF(NOT ${_CURRENT_VAR})
-  ENDFOREACH(_CURRENT_VAR)
+  foreach(_CURRENT_VAR ${ARGN})
+    if(NOT ${_CURRENT_VAR})
+      set(${_NAME_UPPER}_FOUND FALSE)
+    endif(NOT ${_CURRENT_VAR})
+  endforeach(_CURRENT_VAR)
 
-  IF (${_NAME_UPPER}_FOUND)
-    IF (NOT ${_NAME}_FIND_QUIETLY)
-        MESSAGE(STATUS "Found ${_NAME}: ${${_VAR1}}")
-    ENDIF (NOT ${_NAME}_FIND_QUIETLY)
-  ELSE (${_NAME_UPPER}_FOUND)
-    IF (${_NAME}_FIND_REQUIRED)
-        MESSAGE(FATAL_ERROR "${_FAIL_MESSAGE}")
-    ELSE (${_NAME}_FIND_REQUIRED)
-      IF (NOT ${_NAME}_FIND_QUIETLY)
-        MESSAGE(STATUS "${_FAIL_MESSAGE}")
-      ENDIF (NOT ${_NAME}_FIND_QUIETLY)
-    ENDIF (${_NAME}_FIND_REQUIRED)
-  ENDIF (${_NAME_UPPER}_FOUND)
-ENDMACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS)
+  if (${_NAME_UPPER}_FOUND)
+    if (NOT ${_NAME}_FIND_QUIETLY)
+        message(STATUS "Found ${_NAME}: ${${_VAR1}}")
+    endif (NOT ${_NAME}_FIND_QUIETLY)
+  else (${_NAME_UPPER}_FOUND)
+    if (${_NAME}_FIND_REQUIRED)
+        message(FATAL_ERROR "${_FAIL_MESSAGE}")
+    else (${_NAME}_FIND_REQUIRED)
+      if (NOT ${_NAME}_FIND_QUIETLY)
+        message(STATUS "${_FAIL_MESSAGE}")
+      endif (NOT ${_NAME}_FIND_QUIETLY)
+    endif (${_NAME}_FIND_REQUIRED)
+  endif (${_NAME_UPPER}_FOUND)
+endmacro(FIND_PACKAGE_HANDLE_STANDARD_ARGS)
diff --git a/contrib/eigen/bench/btl/cmake/FindTvmet.cmake b/contrib/eigen/bench/btl/cmake/FindTvmet.cmake
index 26a29d965d125b34b1fff2c6bbfc2b5bf52e42df..8ccae271b99f1552da0b07534078a31563ce25bf 100644
--- a/contrib/eigen/bench/btl/cmake/FindTvmet.cmake
+++ b/contrib/eigen/bench/btl/cmake/FindTvmet.cmake
@@ -15,7 +15,7 @@ if (TVMET_INCLUDE_DIR)
   # in cache already
   set(TVMET_FOUND TRUE)
 
-else (TVMET_INCLUDE_DIR)
+else ()
 
 find_path(TVMET_INCLUDE_DIR NAMES tvmet/tvmet.h
      PATHS
@@ -28,5 +28,5 @@ find_package_handle_standard_args(Tvmet DEFAULT_MSG TVMET_INCLUDE_DIR)
 
 mark_as_advanced(TVMET_INCLUDE_DIR)
 
-endif(TVMET_INCLUDE_DIR)
+endif()
 
diff --git a/contrib/eigen/bench/btl/cmake/MacroOptionalAddSubdirectory.cmake b/contrib/eigen/bench/btl/cmake/MacroOptionalAddSubdirectory.cmake
index 545048b684371ec6b8bdab3aaee2415b15b321a0..8d46fcea2ab90d8149c380c010e348bcec28d2e6 100644
--- a/contrib/eigen/bench/btl/cmake/MacroOptionalAddSubdirectory.cmake
+++ b/contrib/eigen/bench/btl/cmake/MacroOptionalAddSubdirectory.cmake
@@ -1,6 +1,6 @@
-# - MACRO_OPTIONAL_ADD_SUBDIRECTORY() combines ADD_SUBDIRECTORY() with an OPTION()
+# - MACRO_OPTIONAL_ADD_SUBDIRECTORY() combines add_subdirectory() with an option()
 # MACRO_OPTIONAL_ADD_SUBDIRECTORY( <dir> )
-# If you use MACRO_OPTIONAL_ADD_SUBDIRECTORY() instead of ADD_SUBDIRECTORY(),
+# If you use MACRO_OPTIONAL_ADD_SUBDIRECTORY() instead of add_subdirectory(),
 # this will have two effects
 # 1 - CMake will not complain if the directory doesn't exist
 #     This makes sense if you want to distribute just one of the subdirs
@@ -16,16 +16,16 @@
 # For details see the accompanying COPYING-CMAKE-SCRIPTS file.
 
 
-MACRO (MACRO_OPTIONAL_ADD_SUBDIRECTORY _dir )
-   GET_FILENAME_COMPONENT(_fullPath ${_dir} ABSOLUTE)
-   IF(EXISTS ${_fullPath})
-      IF(${ARGC} EQUAL 2)
-        OPTION(BUILD_${_dir} "Build directory ${_dir}" ${ARGV1})
-      ELSE(${ARGC} EQUAL 2)
-        OPTION(BUILD_${_dir} "Build directory ${_dir}" TRUE)
-      ENDIF(${ARGC} EQUAL 2)
-      IF(BUILD_${_dir})
-         ADD_SUBDIRECTORY(${_dir})
-      ENDIF(BUILD_${_dir})
-   ENDIF(EXISTS ${_fullPath})
-ENDMACRO (MACRO_OPTIONAL_ADD_SUBDIRECTORY)
+macro (MACRO_OPTIONAL_ADD_SUBDIRECTORY _dir )
+   get_filename_component(_fullPath ${_dir} ABSOLUTE)
+   if(EXISTS ${_fullPath})
+      if(${ARGC} EQUAL 2)
+        option(BUILD_${_dir} "Build directory ${_dir}" ${ARGV1})
+      else(${ARGC} EQUAL 2)
+        option(BUILD_${_dir} "Build directory ${_dir}" TRUE)
+      endif(${ARGC} EQUAL 2)
+      if(BUILD_${_dir})
+         add_subdirectory(${_dir})
+      endif(BUILD_${_dir})
+   endif(EXISTS ${_fullPath})
+endmacro (MACRO_OPTIONAL_ADD_SUBDIRECTORY)
diff --git a/contrib/eigen/bench/btl/data/CMakeLists.txt b/contrib/eigen/bench/btl/data/CMakeLists.txt
index 6af2a366f77ac4923d84584f1e7e4d7a834123bc..580c1ced0fb438971fabcbb023607ef087552da5 100644
--- a/contrib/eigen/bench/btl/data/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/data/CMakeLists.txt
@@ -1,25 +1,25 @@
 
-ADD_CUSTOM_TARGET(copy_scripts)
+add_custom_target(copy_scripts)
 
-SET(script_files go_mean mk_mean_script.sh mk_new_gnuplot.sh
+set(script_files go_mean mk_mean_script.sh mk_new_gnuplot.sh
     perlib_plot_settings.txt action_settings.txt gnuplot_common_settings.hh )
 
-FOREACH(script_file ${script_files})
-ADD_CUSTOM_COMMAND(
+foreach(script_file ${script_files})
+add_custom_command(
   TARGET copy_scripts
   POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/${script_file} ${CMAKE_CURRENT_BINARY_DIR}/
   ARGS
 )
-ENDFOREACH(script_file)
+endforeach(script_file)
 
-ADD_CUSTOM_COMMAND(
+add_custom_command(
   TARGET copy_scripts
   POST_BUILD
   COMMAND ${CMAKE_CXX_COMPILER} --version | head -n 1 > ${CMAKE_CURRENT_BINARY_DIR}/compiler_version.txt
   ARGS
 )
-ADD_CUSTOM_COMMAND(
+add_custom_command(
   TARGET copy_scripts
   POST_BUILD
   COMMAND echo "${Eigen_SOURCE_DIR}" > ${CMAKE_CURRENT_BINARY_DIR}/eigen_root_dir.txt
diff --git a/contrib/eigen/bench/btl/data/go_mean b/contrib/eigen/bench/btl/data/go_mean
index 42338ca27811906c948b0ea2ba0f28f7957b00c2..d014269099534ca88020d2a29bd966a51bb8efb9 100755
--- a/contrib/eigen/bench/btl/data/go_mean
+++ b/contrib/eigen/bench/btl/data/go_mean
@@ -27,7 +27,7 @@ echo '<ul>'\
   '<li>' `cat /proc/cpuinfo | grep "model name" | head -n 1`\
   '  (' `uname -m` ')</li>'\
   '<li> compiler: ' `cat compiler_version.txt` '</li>'\
-  '<li> eigen3: ' `hg identify -i $EIGENDIR` '</li>'\
+  '<li> eigen3: ' `git ls-remote --refs  -q $EIGENDIR HEAD | cut  -f 1` '</li>'\
   '</ul>' \
   '</p>'  >> $webpagefilename
 
diff --git a/contrib/eigen/bench/btl/generic_bench/bench.hh b/contrib/eigen/bench/btl/generic_bench/bench.hh
index 7b7b951b509e7e75e9251ca35433c1a2cfe5150d..0732940d532679af3345b836e52dfeefcb887548 100644
--- a/contrib/eigen/bench/btl/generic_bench/bench.hh
+++ b/contrib/eigen/bench/btl/generic_bench/bench.hh
@@ -159,7 +159,7 @@ BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point ){
 //    bench<Mixed_Perf_Analyzer,Action>(size_min,size_max,nb_point);
 
 
-  // Only for small problem size. Otherwize it will be too long
+  // Only for small problem size. Otherwise it will be too long
 //   bench<X86_Perf_Analyzer,Action>(size_min,size_max,nb_point);
 //   bench<STL_Perf_Analyzer,Action>(size_min,size_max,nb_point);
 
diff --git a/contrib/eigen/bench/btl/generic_bench/utils/size_log.hh b/contrib/eigen/bench/btl/generic_bench/utils/size_log.hh
index 13a3da7a8f94c052dc620a0d190ab84809df9333..68945e7cc12a13497b4ef091fe34ed22fced1f90 100644
--- a/contrib/eigen/bench/btl/generic_bench/utils/size_log.hh
+++ b/contrib/eigen/bench/btl/generic_bench/utils/size_log.hh
@@ -23,7 +23,7 @@
 #include "math.h"
 // The Vector class must satisfy the following part of STL vector concept :
 //            resize() method
-//            [] operator for seting element
+//            [] operator for setting element
 // the vector element are int compatible.
 template<class Vector>
 void size_log(const int nb_point, const int size_min, const int size_max, Vector & X)
diff --git a/contrib/eigen/bench/btl/generic_bench/utils/xy_file.hh b/contrib/eigen/bench/btl/generic_bench/utils/xy_file.hh
index 4571bed8f81fb047a663a24258cd717e5c084f31..0492faf09082c4e06d3410487ed33d86744a3e51 100644
--- a/contrib/eigen/bench/btl/generic_bench/utils/xy_file.hh
+++ b/contrib/eigen/bench/btl/generic_bench/utils/xy_file.hh
@@ -55,7 +55,7 @@ bool read_xy_file(const std::string & filename, std::vector<int> & tab_sizes,
 
 // The Vector class must satisfy the following part of STL vector concept :
 //            resize() method
-//            [] operator for seting element
+//            [] operator for setting element
 // the vector element must have the << operator define
 
 using namespace std;
diff --git a/contrib/eigen/bench/btl/libs/BLAS/CMakeLists.txt b/contrib/eigen/bench/btl/libs/BLAS/CMakeLists.txt
index 0272ccad07a4a0eab515a79178b46fac1fcd83fe..f2738f1695b4d124c01d6cefb65a165a7986c67f 100644
--- a/contrib/eigen/bench/btl/libs/BLAS/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/libs/BLAS/CMakeLists.txt
@@ -5,8 +5,8 @@ if (ATLAS_FOUND)
   if(BUILD_btl_atlas)
     target_link_libraries(btl_atlas ${ATLAS_LIBRARIES})
     set_target_properties(btl_atlas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=ATLAS -DHAS_LAPACK=1")
-  endif(BUILD_btl_atlas)
-endif (ATLAS_FOUND)
+  endif()
+endif ()
 
 find_package(MKL)
 if (MKL_FOUND)
@@ -14,8 +14,8 @@ if (MKL_FOUND)
   if(BUILD_btl_mkl)
     target_link_libraries(btl_mkl ${MKL_LIBRARIES})
     set_target_properties(btl_mkl PROPERTIES COMPILE_FLAGS "-DCBLASNAME=INTEL_MKL -DHAS_LAPACK=1")
-  endif(BUILD_btl_mkl)
-endif (MKL_FOUND)
+  endif()
+endif ()
 
 
 find_package(OPENBLAS)
@@ -24,8 +24,8 @@ if (OPENBLAS_FOUND)
   if(BUILD_btl_openblas)
     target_link_libraries(btl_openblas ${OPENBLAS_LIBRARIES} )
     set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS")
-  endif(BUILD_btl_openblas)
-endif (OPENBLAS_FOUND)
+  endif()
+endif ()
 
 find_package(ACML)
 if (ACML_FOUND)
@@ -33,8 +33,8 @@ if (ACML_FOUND)
   if(BUILD_btl_acml)
     target_link_libraries(btl_acml ${ACML_LIBRARIES} )
     set_target_properties(btl_acml PROPERTIES COMPILE_FLAGS "-DCBLASNAME=ACML -DHAS_LAPACK=1")
-  endif(BUILD_btl_acml)
-endif (ACML_FOUND)
+  endif()
+endif ()
 
 if(Eigen_SOURCE_DIR AND CMAKE_Fortran_COMPILER_WORKS)
   # we are inside Eigen and blas/lapack interface is compilable
diff --git a/contrib/eigen/bench/btl/libs/BLAS/blas_interface_impl.hh b/contrib/eigen/bench/btl/libs/BLAS/blas_interface_impl.hh
index fc4ba2a1f39954351cf520bafd649f118280926b..9e0a649052847102a752542c3e6d786d969ad031 100644
--- a/contrib/eigen/bench/btl/libs/BLAS/blas_interface_impl.hh
+++ b/contrib/eigen/bench/btl/libs/BLAS/blas_interface_impl.hh
@@ -46,9 +46,9 @@ public :
     BLAS_FUNC(gemm)(&notrans,&notrans,&N,&N,&N,&fone,A,&N,B,&N,&fzero,X,&N);
   }
 
-//   static inline void ata_product(gene_matrix & A, gene_matrix & X, int N){
-//     ssyrk_(&lower,&trans,&N,&N,&fone,A,&N,&fzero,X,&N);
-//   }
+  static inline void ata_product(gene_matrix & A, gene_matrix & X, int N){
+    BLAS_FUNC(syrk)(&lower,&trans,&N,&N,&fone,A,&N,&fzero,X,&N);
+  }
 
   static inline void aat_product(gene_matrix & A, gene_matrix & X, int N){
     BLAS_FUNC(syrk)(&lower,&notrans,&N,&N,&fone,A,&N,&fzero,X,&N);
diff --git a/contrib/eigen/bench/btl/libs/BLAS/main.cpp b/contrib/eigen/bench/btl/libs/BLAS/main.cpp
index 564d55ef23f6becc520f69032e3c41f575cbdf01..fd991490afeb4992e3f2a2d353fb0a6a7c92eadb 100644
--- a/contrib/eigen/bench/btl/libs/BLAS/main.cpp
+++ b/contrib/eigen/bench/btl/libs/BLAS/main.cpp
@@ -48,7 +48,7 @@ int main()
   bench<Action_rot<blas_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
 
   bench<Action_matrix_matrix_product<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-//   bench<Action_ata_product<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_ata_product<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
   bench<Action_aat_product<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
 
   bench<Action_trisolve<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
diff --git a/contrib/eigen/bench/btl/libs/STL/STL_interface.hh b/contrib/eigen/bench/btl/libs/STL/STL_interface.hh
index ef4cc923309fe6c7ce2e1761749cbb420c812ada..16658c4baa1768e6de86513dde0badaf102e3b9c 100644
--- a/contrib/eigen/bench/btl/libs/STL/STL_interface.hh
+++ b/contrib/eigen/bench/btl/libs/STL/STL_interface.hh
@@ -78,18 +78,18 @@ public :
         cible[i][j]=source[i][j];
   }
 
-//   static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N)
-//   {
-//     real somme;
-//     for (int j=0;j<N;j++){
-//       for (int i=0;i<N;i++){
-//         somme=0.0;
-//         for (int k=0;k<N;k++)
-//           somme += A[i][k]*A[j][k];
-//         X[j][i]=somme;
-//       }
-//     }
-//   }
+  static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N)
+  {
+    real somme;
+    for (int j=0;j<N;j++){
+      for (int i=0;i<N;i++){
+        somme=0.0;
+        for (int k=0;k<N;k++)
+          somme += A[i][k]*A[j][k];
+        X[j][i]=somme;
+      }
+    }
+  }
 
   static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N)
   {
diff --git a/contrib/eigen/bench/btl/libs/blaze/blaze_interface.hh b/contrib/eigen/bench/btl/libs/blaze/blaze_interface.hh
index ee1523944125950a4b8d669c0dd835e52f8d3800..7b418f6da067b1d15c0d77cdfa90f057c0f3976b 100644
--- a/contrib/eigen/bench/btl/libs/blaze/blaze_interface.hh
+++ b/contrib/eigen/bench/btl/libs/blaze/blaze_interface.hh
@@ -20,6 +20,7 @@
 
 #include <blaze/Math.h>
 #include <blaze/Blaze.h>
+#include <Eigen/Core>
 // using namespace blaze;
 
 #include <vector>
@@ -80,35 +81,35 @@ public :
     }
   }
 
-  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static EIGEN_DONT_INLINE void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
     X = (A*B);
   }
 
-  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static EIGEN_DONT_INLINE void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
     X = (trans(A)*trans(B));
   }
 
-  static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
+  static EIGEN_DONT_INLINE void ata_product(const gene_matrix & A, gene_matrix & X, int N){
     X = (trans(A)*A);
   }
 
-  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
+  static EIGEN_DONT_INLINE void aat_product(const gene_matrix & A, gene_matrix & X, int N){
     X = (A*trans(A));
   }
 
-  static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+  static EIGEN_DONT_INLINE void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
     X = (A*B);
   }
 
-  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+  static EIGEN_DONT_INLINE void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
     X = (trans(A)*B);
   }
 
-  static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N){
+  static EIGEN_DONT_INLINE void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N){
     Y += coef * X;
   }
 
-  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+  static EIGEN_DONT_INLINE void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
     Y = a*X + b*Y;
   }
 
diff --git a/contrib/eigen/bench/btl/libs/blaze/main.cpp b/contrib/eigen/bench/btl/libs/blaze/main.cpp
index 80e8f4eaa03a91f285d61a17e09613f53a13505d..ccae0cbd57f92b6a7739e4f65452a4a655df5e15 100644
--- a/contrib/eigen/bench/btl/libs/blaze/main.cpp
+++ b/contrib/eigen/bench/btl/libs/blaze/main.cpp
@@ -30,9 +30,9 @@ int main()
 
   bench<Action_matrix_vector_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
   bench<Action_atv_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
-//   bench<Action_matrix_matrix_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-//   bench<Action_ata_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-//   bench<Action_aat_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_matrix_matrix_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_ata_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_aat_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
 
   return 0;
 }
diff --git a/contrib/eigen/bench/btl/libs/blitz/CMakeLists.txt b/contrib/eigen/bench/btl/libs/blitz/CMakeLists.txt
index 880ab7338564d89aaa919e7af064faf06b27e44e..e203c81522e0b7bdc5f1c0e65fb944d0e8d69049 100644
--- a/contrib/eigen/bench/btl/libs/blitz/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/libs/blitz/CMakeLists.txt
@@ -7,11 +7,11 @@ if (BLITZ_FOUND)
   btl_add_bench(btl_blitz btl_blitz.cpp)
   if (BUILD_btl_blitz)
     target_link_libraries(btl_blitz ${BLITZ_LIBRARIES})
-  endif (BUILD_btl_blitz)
+  endif ()
 
   btl_add_bench(btl_tiny_blitz btl_tiny_blitz.cpp OFF)
   if (BUILD_btl_tiny_blitz)
     target_link_libraries(btl_tiny_blitz ${BLITZ_LIBRARIES})
-  endif (BUILD_btl_tiny_blitz)
+  endif ()
 
-endif (BLITZ_FOUND)
+endif ()
diff --git a/contrib/eigen/bench/btl/libs/eigen3/CMakeLists.txt b/contrib/eigen/bench/btl/libs/eigen3/CMakeLists.txt
index 00cae23d3a74f3cfe14b2ecf8335145aa185a419..06a72b4fb993ff1fb36fe4cc4d8b199da6590802 100644
--- a/contrib/eigen/bench/btl/libs/eigen3/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/libs/eigen3/CMakeLists.txt
@@ -47,9 +47,9 @@ if (EIGEN3_FOUND)
 
 #     if(BUILD_btl_eigen3_adv)
 #       target_link_libraries(btl_eigen3_adv ${MKL_LIBRARIES})
-#     endif(BUILD_btl_eigen3_adv)
+#     endif()
 
-  endif(NOT BTL_NOVEC)
+  endif()
 
   btl_add_bench(btl_tiny_eigen3 btl_tiny_eigen3.cpp OFF)
 
@@ -59,7 +59,7 @@ if (EIGEN3_FOUND)
 
     if(BUILD_btl_tiny_eigen3_novec)
       btl_add_target_property(btl_tiny_eigen3_novec    COMPILE_FLAGS "-DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen3_tiny_novec")
-    endif(BUILD_btl_tiny_eigen3_novec)
-  endif(NOT BTL_NOVEC)
+    endif()
+  endif()
 
-endif (EIGEN3_FOUND)
+endif ()
diff --git a/contrib/eigen/bench/btl/libs/eigen3/eigen3_interface.hh b/contrib/eigen/bench/btl/libs/eigen3/eigen3_interface.hh
index b821fd721174d08ecf51feeb3d604248e1c68c1c..2e302d0729306515cfca1975ca6c1689e212d361 100644
--- a/contrib/eigen/bench/btl/libs/eigen3/eigen3_interface.hh
+++ b/contrib/eigen/bench/btl/libs/eigen3/eigen3_interface.hh
@@ -92,9 +92,11 @@ public :
     X.noalias() = A.transpose()*B.transpose();
   }
 
-//   static inline void ata_product(const gene_matrix & A, gene_matrix & X, int  /*N*/){
-//     X.noalias() = A.transpose()*A;
-//   }
+  static inline void ata_product(const gene_matrix & A, gene_matrix & X, int  /*N*/){
+    //X.noalias() = A.transpose()*A;
+    X.template triangularView<Lower>().setZero();
+    X.template selfadjointView<Lower>().rankUpdate(A.transpose());
+  }
 
   static inline void aat_product(const gene_matrix & A, gene_matrix & X, int  /*N*/){
     X.template triangularView<Lower>().setZero();
diff --git a/contrib/eigen/bench/btl/libs/eigen3/main_matmat.cpp b/contrib/eigen/bench/btl/libs/eigen3/main_matmat.cpp
index 926fa2b0177d92d2db61c00fb39335e1c1e4ec25..052810a16b7836333d6d989e506614e8a05bc0fb 100644
--- a/contrib/eigen/bench/btl/libs/eigen3/main_matmat.cpp
+++ b/contrib/eigen/bench/btl/libs/eigen3/main_matmat.cpp
@@ -25,7 +25,7 @@ BTL_MAIN;
 int main()
 {
   bench<Action_matrix_matrix_product<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-//   bench<Action_ata_product<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_ata_product<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
   bench<Action_aat_product<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
   bench<Action_trmm<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
 
diff --git a/contrib/eigen/bench/btl/libs/gmm/CMakeLists.txt b/contrib/eigen/bench/btl/libs/gmm/CMakeLists.txt
index bc2586243092771e0b1f9919c6a093f522b8d685..0bcb0465935f9c40b7f36debb5c7f9309ff20b75 100644
--- a/contrib/eigen/bench/btl/libs/gmm/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/libs/gmm/CMakeLists.txt
@@ -3,4 +3,4 @@ find_package(GMM)
 if (GMM_FOUND)
   include_directories(${GMM_INCLUDES})
   btl_add_bench(btl_gmm main.cpp)
-endif (GMM_FOUND)
+endif ()
diff --git a/contrib/eigen/bench/btl/libs/mtl4/CMakeLists.txt b/contrib/eigen/bench/btl/libs/mtl4/CMakeLists.txt
index 14b47a808c990500baadcef95268a8636b2e3bd1..132a501bea47545f6d663db75fc5a0a685bc5b53 100644
--- a/contrib/eigen/bench/btl/libs/mtl4/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/libs/mtl4/CMakeLists.txt
@@ -3,4 +3,4 @@ find_package(MTL4)
 if (MTL4_FOUND)
   include_directories(${MTL4_INCLUDE_DIR})
   btl_add_bench(btl_mtl4 main.cpp)
-endif (MTL4_FOUND)
+endif ()
diff --git a/contrib/eigen/bench/btl/libs/tensors/CMakeLists.txt b/contrib/eigen/bench/btl/libs/tensors/CMakeLists.txt
index 09d6d8e43ee4de933fd273105b56f42c7f75a701..e10a736f575db9864be39d1e1e1e53807e22febe 100644
--- a/contrib/eigen/bench/btl/libs/tensors/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/libs/tensors/CMakeLists.txt
@@ -39,6 +39,6 @@ if (TENSOR_FOUND)
     btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
     btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
 
-  endif(NOT BTL_NOVEC)
+  endif()
 
-endif (TENSOR_FOUND)
+endif ()
diff --git a/contrib/eigen/bench/btl/libs/tvmet/CMakeLists.txt b/contrib/eigen/bench/btl/libs/tvmet/CMakeLists.txt
index 25b565b972360f99d49dc7560bd7989ba1f24682..e7376972cb53dde76ae29f08e8d39cb27437d05b 100644
--- a/contrib/eigen/bench/btl/libs/tvmet/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/libs/tvmet/CMakeLists.txt
@@ -3,4 +3,4 @@ find_package(Tvmet)
 if (TVMET_FOUND)
   include_directories(${TVMET_INCLUDE_DIR})
   btl_add_bench(btl_tvmet main.cpp OFF)
-endif (TVMET_FOUND)
+endif ()
diff --git a/contrib/eigen/bench/btl/libs/ublas/CMakeLists.txt b/contrib/eigen/bench/btl/libs/ublas/CMakeLists.txt
index bdb58bea1ba0c7aedec5b82f1c6b8f2c2c443c89..5accf5b820ca9e7cb5c875def89ceeac7b9dc580 100644
--- a/contrib/eigen/bench/btl/libs/ublas/CMakeLists.txt
+++ b/contrib/eigen/bench/btl/libs/ublas/CMakeLists.txt
@@ -4,4 +4,4 @@ if (Boost_FOUND)
   include_directories(${Boost_INCLUDE_DIRS})
   include_directories(${Boost_INCLUDES})
   btl_add_bench(btl_ublas main.cpp)
-endif (Boost_FOUND)
+endif ()
diff --git a/contrib/eigen/bench/btl/libs/ublas/ublas_interface.hh b/contrib/eigen/bench/btl/libs/ublas/ublas_interface.hh
index 95cad5195e3cbf2885a69696c550d06c190b4755..f59b7cf2f537802329a2f39c76063f7a039cd362 100644
--- a/contrib/eigen/bench/btl/libs/ublas/ublas_interface.hh
+++ b/contrib/eigen/bench/btl/libs/ublas/ublas_interface.hh
@@ -100,7 +100,7 @@ public :
     Y+=coef*X;
   }
 
-  // alias free assignements
+  // alias free assignments
 
   static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
     X.assign(prod(A,B));
diff --git a/contrib/eigen/bench/eig33.cpp b/contrib/eigen/bench/eig33.cpp
index 47947a9bed50df66bcc840dbf3fafeb60cae87e8..f003d8a53aed5b9fbf0322b0097c4f225c4d029c 100644
--- a/contrib/eigen/bench/eig33.cpp
+++ b/contrib/eigen/bench/eig33.cpp
@@ -101,7 +101,7 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals)
   computeRoots(scaledMat,evals);
 
   // compute the eigen vectors
-  // **here we assume 3 differents eigenvalues**
+  // **here we assume 3 different eigenvalues**
 
   // "optimized version" which appears to be slower with gcc!
 //     Vector base;
diff --git a/contrib/eigen/bench/perf_monitoring/changesets.txt b/contrib/eigen/bench/perf_monitoring/changesets.txt
new file mode 100644
index 0000000000000000000000000000000000000000..efdd9a0ff8ec9a2ffc3985941cfd0be4b2f66629
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/changesets.txt
@@ -0,0 +1,95 @@
+Load hg-to-git hash maps from ./eigen_git/.git/
+#3.0.1
+#3.1.1
+#3.2.0
+3.2.4
+#574a7621809fe
+58964a85800bd  # introduce AVX
+#589cbd7e98174  # merge
+589db7d49efbb  # introduce FMA
+#590a078f442a3  # complex and AVX
+590a419cea4a0  # improve packing with ptranspose
+#59251e85c936d  # merge
+#592e497a27ddc
+593d5a795f673  # New gebp kernel: up to 3 packets x 4 register-level blocks
+#5942c3c95990d  # merge
+#596c9788d55b9  # Disable 3pX4 kernel on Altivec
+#5999aa3dc4e21  # merge
+6209452eb38f8   # before-evaluators
+#6333eba5e1101  # Implement evaluator for sparse outer products
+#663b9d314ae19
+#6655ef95fabee  # Properly detect FMA support on ARM
+#667fe25f3b8e3   # FMA has been wrongly disabled
+#668409547a0c8
+#6694304c73542   # merge default to tensors
+#67216047c8d4a   # merge default to tensors
+#67410a79ca3a3   # merge default to tensors
+#674b7271dffb5   # Generalized the gebp apis
+676bfdd9f3ac9   # Made the blocking computation aware of the l3 cache;<br/> Also optimized the blocking parameters to take<br/> into account the number of threads used for a computation.
+6782dde63499c   # generalized gemv
+6799f98650d0a   # ensured that contractions that can be reduced to a matrix vector product
+#6840918c51e60   # merge tensor
+684e972b55ec4   # change prefetching in gebp
+#68598604576d1   # merge index conversion
+68963eb0f6fe6   # clean blocking size computation
+689db05f2d01e   # rotating kernel for ARM only
+#6901b7e12847d   # result_of
+69226275b250a   # fix prefetching change for ARM
+692692136350b   # prefetching
+693a8ad8887bf   # blocking size strategy
+693bcf9bb5c1f   # avoid redundant pack_rhs
+6987550107028   # dynamic loop swapping
+69858740ce4c6   # rm dynamic loop swapping,<br/> adjust lhs's micro panel height to fully exploit L1 cache
+698cd3bbffa73   # blocking heuristic:<br/> block on the rhs in L1 if the lhs fit in L1.
+701488c15615a   # organize a little our default cache sizes,<br/> and use a saner default L1 outside of x86 (10% faster on Nexus 5)
+701e56aabf205   # Refactor computeProductBlockingSizes to make room<br/> for the possibility of using lookup tables
+701ca5c12587b   # Polish lookup tables generation
+7013589a9c115   # actual_panel_rows computation should always be resilient<br/> to parameters not consistent with the known L1 cache size, see comment
+70102babb9c0f   # Provide a empirical lookup table for blocking sizes measured on a Nexus 5.<br/> Only for float, only for Android on ARM 32bit for now.
+7088481dc21ea   # Bug 986: add support for coefficient-based<br/> product with 0 depth.
+709d7f51feb07   # Bug 992: don't select a 3p GEMM path with non-SIMD scalar types.
+759f9303cc7c5   # 3.3-alpha1
+765aba1eda71e   # help clang inlining
+770fe630c9873   # Improve numerical accuracy in LLT and triangular solve<br/> by using true scalar divisions (instead of x * (1/y))
+#8741d23430628   # Improved the matrix multiplication blocking in the case<br/> where mr is not a power of 2 (e.g on Haswell CPUs)
+878f629fe95c8   # Made the index type a template parameter to evaluateProductBlockingSizes.<br/> Use numext::mini and numext::maxi instead of <br/> std::min/std::max to compute blocking sizes.
+8975d51a7f12c   # Don't optimize the processing of the last rows of<br/> a matrix matrix product in cases that violate<br/> the assumptions made by the optimized code path.
+8986136f4fdd4   # Remove the rotating kernel.
+898e68e165a23   # Bug 256: enable vectorization with unaligned loads/stores.
+91466e99ab6a1   # Relax mixing-type constraints for binary coeff-wise operators
+91776236cdea4   # merge
+917101ea26f5e   # Include the cost of stores in unrolling
+921672076db5d   # Fix perf regression introduced in changeset e56aabf205
+9210fa9e4a15c   # Fix perf regression in dgemm introduced by changeset 5d51a7f12c
+936f6b3cf8de9   # 3.3-beta2
+944504a4404f1   # Optimize expression matching 'd?=a-b*c' as 'd?=a; d?=b*c;'
+95877e27fbeee   # 3.3-rc1
+959779774f98c   # Bug 1311: fix alignment logic in some cases<br/> of (scalar*small).lazyProduct(small)
+9729f9d8d2f62   # Disabled part of the matrix matrix peeling code<br/> that's incompatible with 512 bit registers
+979eeac81b8c0   # 3.3.0
+989c927af60ed   # Fix a performance regression in (mat*mat)*vec<br/> for which mat*mat was evaluated multiple times.
+994fe696022ec   # Operators += and -= do not resize!
+99466f65ccc36   # Ease compiler generating clean and efficient code in mat*vec
+9946a5fe86098   # Complete rewrite of column-major-matrix * vector product<br/> to deliver higher performance of modern CPU.
+99591003f3b86   # Improve performance of row-major-dense-matrix * vector products<br/> for recent CPUs.
+997eb621413c1   # Revert vec/y to vec*(1/y) in row-major TRSM
+10444bbc320468  # Bug 1435: fix aliasing issue in exressions like: A = C - B*A;
+1073624df50945  # Adds missing EIGEN_STRONG_INLINE to support MSVC<br/> properly inlining small vector calculations
+1094d428a199ab  # Bug 1562: optimize evaluation of small products<br/> of the form s*A*B by rewriting them as: s*(A.lazyProduct(B))<br/> to save a costly temporary.<br/> Measured speedup from 2x to 5x.
+1096de9e31a06d  # Introduce the macro ei_declare_local_nested_eval to<br/> help allocating on the stack local temporaries via alloca,<br/> and let outer-products makes a good use of it.
+11087b91c11207  # Bug 1578: Improve prefetching in matrix multiplication on MIPS.
+1153aa110e681b  # PR 526: Speed up multiplication of small, dynamically sized matrices
+11544ad359237a  # Vectorize row-by-row gebp loop iterations on 16 packets as well
+1157a476054879  # Bug 1624: improve matrix-matrix product on ARM 64, 20% speedup
+1160a4159dba08  # do not read buffers out of bounds
+1163c53eececb0  # Implement AVX512 vectorization of std::complex<float/double>
+11644e7746fe22  # Bug 1636: fix gemm performance issue with gcc>=6 and no FMA
+1164956678a4ef  # Bug 1515: disable gebp's 3pX4 micro kernel<br/> for MSVC<=19.14 because of register spilling.
+1165426bce7529  # fix EIGEN_GEBP_2PX4_SPILLING_WORKAROUND<br/> for non vectorized type, and non x86/64 target
+11660d90637838  # enable spilling workaround on architectures with SSE/AVX
+1166f159cf3d75  # Artificially increase l1-blocking size for AVX512.<br/> +10% speedup with current kernels.
+11686dd93f7e3b  # Make code compile again for older compilers.
+1175dbfcceabf5  # Bug: 1633: refactor gebp kernel and optimize for neon
+117670e133333d  # Bug 1661: fix regression in GEBP and AVX512
+11760f028f61cb  # GEBP: cleanup logic to choose between<br/> a 4 packets of 1 packet (=e118ce86fd+fix)
+1180de77bf5d6c  # gebp: Add new ½ and ¼ packet rows per (peeling) round on the lhs
diff --git a/contrib/eigen/bench/perf_monitoring/gemm.cpp b/contrib/eigen/bench/perf_monitoring/gemm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..804139db7a2ffbc27a059ef91693efc7d15c552b
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/gemm.cpp
@@ -0,0 +1,12 @@
+#include "gemm_common.h"
+
+EIGEN_DONT_INLINE
+void gemm(const Mat &A, const Mat &B, Mat &C)
+{
+  C.noalias() += A * B;
+}
+
+int main(int argc, char **argv)
+{
+  return main_gemm(argc, argv, gemm);
+}
diff --git a/contrib/eigen/bench/perf_monitoring/gemm/changesets.txt b/contrib/eigen/bench/perf_monitoring/gemm/changesets.txt
deleted file mode 100644
index af8eb9b8f7bb9ad6e8288123de6470dff347cd55..0000000000000000000000000000000000000000
--- a/contrib/eigen/bench/perf_monitoring/gemm/changesets.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-#3.0.1
-#3.1.1
-#3.2.0
-3.2.4
-#5745:37f59e65eb6c
-5891:d8652709345d  # introduce AVX
-#5893:24b4dc92c6d3  # merge
-5895:997c2ef9fc8b  # introduce FMA
-#5904:e1eafd14eaa1  # complex and AVX
-5908:f8ee3c721251  # improve packing with ptranspose
-#5921:ca808bb456b0  # merge
-#5927:8b1001f9e3ac
-5937:5a4ca1ad8c53  # New gebp kernel handling up to 3 packets x 4 register-level blocks
-#5949:f3488f4e45b2  # merge
-#5969:e09031dccfd9  # Disable 3pX4 kernel on Altivec
-#5992:4a429f5e0483  # merge
-before-evaluators
-#6334:f6a45e5b8b7c  # Implement evaluator for sparse outer products
-#6639:c9121c60b5c7
-#6655:06f163b5221f  # Properly detect FMA support on ARM
-#6677:700e023044e7   # FMA has been wrongly disabled
-#6681:11d31dafb0e3
-#6699:5e6e8e10aad1   # merge default to tensors
-#6726:ff2d2388e7b9   # merge default to tensors
-#6742:0cbd6195e829   # merge default to tensors
-#6747:853d2bafeb8f   # Generalized the gebp apis
-6765:71584fd55762   # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation
-#6781:9cc5a931b2c6   # generalized gemv
-#6792:f6e1daab600a   # ensured that contractions that can be reduced to a matrix vector product
-#6844:039efd86b75c   # merge tensor
-6845:7333ed40c6ef   # change prefetching in gebp
-#6856:b5be5e10eb7f   # merge index conversion
-#6893:c3a64aba7c70   # clean blocking size computation
-#6898:6fb31ebe6492   # rotating kernel for ARM
-6899:877facace746   # rotating kernel for ARM only
-#6904:c250623ae9fa   # result_of
-6921:915f1b1fc158   # fix prefetching change for ARM
-6923:9ff25f6dacc6   # prefetching
-6933:52572e60b5d3   # blocking size strategy
-6937:c8c042f286b2   # avoid redundant pack_rhs
-6981:7e5d6f78da59   # dynamic loop swapping
-6984:45f26866c091   # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
-6986:a675d05b6f8f   # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
-7013:f875e75f07e5   # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
-7015:8aad8f35c955   # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables
-7016:a58d253e8c91   # Polish lookup tables generation
-7018:9b27294a8186   # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment
-7019:c758b1e2c073   # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now.
-7085:627e039fba68   # Bug 986: add support for coefficient-based product with 0 depth.
-7098:b6f1db9cf9ec   # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code
-7591:09a8e2186610   # 3.3-alpha1
-7650:b0f3c8f43025   # help clang inlining
-#8744:74b789ada92a   # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
-8789:efcb912e4356   # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
-8972:81d53c711775   # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
-8985:d935df21a082   # Remove the rotating kernel.
-8988:6c2dc56e73b3   # Bug 256: enable vectorization with unaligned loads/stores.
-9148:b8b8c421e36c   # Relax mixing-type constraints for binary coefficient-wise operators
-9174:d228bc282ac9   # merge
-9212:c90098affa7b   # Fix performance regression introduced in changeset 8aad8f35c955
-9213:9f1c14e4694b   # Fix performance regression in dgemm introduced by changeset 81d53c711775
diff --git a/contrib/eigen/bench/perf_monitoring/gemm/make_plot.sh b/contrib/eigen/bench/perf_monitoring/gemm/make_plot.sh
deleted file mode 100755
index cd3214ac915b60a913b21ccd0cceba58f1750e88..0000000000000000000000000000000000000000
--- a/contrib/eigen/bench/perf_monitoring/gemm/make_plot.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-# base name of the bench
-# it reads $1.out
-# and generates $1.pdf
-WHAT=$1
-bench=$2
-
-header="rev "
-while read line
-do
-  if [ ! -z '$line' ]; then
-    header="$header  \"$line\""
-  fi
-done < $bench"_settings.txt"
-
-echo $header > $WHAT.out.header
-cat $WHAT.out >> $WHAT.out.header
-
-
-echo "set title '$WHAT'" > $WHAT.gnuplot
-echo "set key autotitle columnhead outside " >> $WHAT.gnuplot
-echo "set xtics rotate 1" >> $WHAT.gnuplot
-
-echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
-echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot
-
-col=`cat $bench"_settings.txt" | wc -l`
-echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
-echo " " >>  $WHAT.gnuplot
-
-gnuplot -persist < $WHAT.gnuplot
-
-# generate a png file
-# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten  .$WHAT.png
-
-# clean
-rm $WHAT.out.header $WHAT.gnuplot
\ No newline at end of file
diff --git a/contrib/eigen/bench/perf_monitoring/gemm/gemm.cpp b/contrib/eigen/bench/perf_monitoring/gemm_common.h
similarity index 67%
rename from contrib/eigen/bench/perf_monitoring/gemm/gemm.cpp
rename to contrib/eigen/bench/perf_monitoring/gemm_common.h
index 614bd47373b471d7a770a9ac8d2789e7637948a8..30dbc0df68db28831aa4af2a4833a4cf0e6908b4 100644
--- a/contrib/eigen/bench/perf_monitoring/gemm/gemm.cpp
+++ b/contrib/eigen/bench/perf_monitoring/gemm_common.h
@@ -1,8 +1,9 @@
 #include <iostream>
 #include <fstream>
 #include <vector>
-#include <Eigen/Core>
-#include "../../BenchTimer.h"
+#include <string>
+#include "eigen_src/Eigen/Core"
+#include "../BenchTimer.h"
 using namespace Eigen;
 
 #ifndef SCALAR
@@ -13,14 +14,9 @@ typedef SCALAR Scalar;
 
 typedef Matrix<Scalar,Dynamic,Dynamic> Mat;
 
+template<typename Func>
 EIGEN_DONT_INLINE
-void gemm(const Mat &A, const Mat &B, Mat &C)
-{
-  C.noalias() += A * B;
-}
-
-EIGEN_DONT_INLINE
-double bench(long m, long n, long k)
+double bench(long m, long n, long k, const Func& f)
 {
   Mat A(m,k);
   Mat B(k,n);
@@ -44,21 +40,25 @@ double bench(long m, long n, long k)
   long rep = std::max(1., std::min(100., up/flops) );
   long tries = std::max(tm0, std::min(tm1, up/flops) );
   
-  BENCH(t, tries, rep, gemm(A,B,C));
+  BENCH(t, tries, rep, f(A,B,C));
   
   return 1e-9 * rep * flops / t.best();
 }
 
-int main(int argc, char **argv)
+template<typename Func>
+int main_gemm(int argc, char **argv, const Func& f)
 {
   std::vector<double> results;
   
-  std::ifstream settings("gemm_settings.txt");
+  std::string filename = std::string("gemm_settings.txt");
+  if(argc>1)
+    filename = std::string(argv[1]);
+  std::ifstream settings(filename);
   long m, n, k;
   while(settings >> m >> n >> k)
   {
     //std::cerr << "  Testing " << m << " " << n << " " << k << std::endl;
-    results.push_back( bench(m, n, k) );
+    results.push_back( bench(m, n, k, f) );
   }
   
   std::cout << RowVectorXd::Map(results.data(), results.size());
diff --git a/contrib/eigen/bench/perf_monitoring/gemm/gemm_settings.txt b/contrib/eigen/bench/perf_monitoring/gemm_settings.txt
similarity index 100%
rename from contrib/eigen/bench/perf_monitoring/gemm/gemm_settings.txt
rename to contrib/eigen/bench/perf_monitoring/gemm_settings.txt
diff --git a/contrib/eigen/bench/perf_monitoring/gemm_square_settings.txt b/contrib/eigen/bench/perf_monitoring/gemm_square_settings.txt
new file mode 100644
index 0000000000000000000000000000000000000000..98474d1736423175879f75e3bd1d641d4a707ca3
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/gemm_square_settings.txt
@@ -0,0 +1,11 @@
+8 8 8
+9 9 9
+12 12 12
+15 15 15
+16 16 16
+24 24 24
+102 102 102
+239 239 239
+240 240 240
+2400 2400 2400
+2463 2463 2463
diff --git a/contrib/eigen/bench/perf_monitoring/gemv.cpp b/contrib/eigen/bench/perf_monitoring/gemv.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82e5ab96031b413a9543b483e80eeaab8048fc98
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/gemv.cpp
@@ -0,0 +1,12 @@
+#include "gemv_common.h"
+
+EIGEN_DONT_INLINE
+void gemv(const Mat &A, const Vec &B, Vec &C)
+{
+  C.noalias() += A * B;
+}
+
+int main(int argc, char **argv)
+{
+  return main_gemv(argc, argv, gemv);
+}
diff --git a/contrib/eigen/bench/perf_monitoring/gemv_common.h b/contrib/eigen/bench/perf_monitoring/gemv_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc32577291384d208fad4e2dc9763d87bbb40538
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/gemv_common.h
@@ -0,0 +1,69 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <functional>
+#include "eigen_src/Eigen/Core"
+#include "../BenchTimer.h"
+using namespace Eigen;
+
+#ifndef SCALAR
+#error SCALAR must be defined
+#endif
+
+typedef SCALAR Scalar;
+
+typedef Matrix<Scalar,Dynamic,Dynamic> Mat;
+typedef Matrix<Scalar,Dynamic,1>       Vec;
+
+template<typename Func>
+EIGEN_DONT_INLINE
+double bench(long m, long n, Func &f)
+{
+  Mat A(m,n);
+  Vec B(n);
+  Vec C(m);
+  A.setRandom();
+  B.setRandom();
+  C.setRandom();
+
+  BenchTimer t;
+
+  double up = 1e8/sizeof(Scalar);
+  double tm0 = 4, tm1 = 10;
+  if(NumTraits<Scalar>::IsComplex)
+  {
+    up /= 4;
+    tm0 = 2;
+    tm1 = 4;
+  }
+
+  double flops = 2. * m * n;
+  long rep = std::max(1., std::min(100., up/flops) );
+  long tries = std::max(tm0, std::min(tm1, up/flops) );
+
+  BENCH(t, tries, rep, f(A,B,C));
+
+  return 1e-9 * rep * flops / t.best();
+}
+
+template<typename Func>
+int main_gemv(int argc, char **argv, Func& f)
+{
+  std::vector<double> results;
+
+  std::string filename = std::string("gemv_settings.txt");
+  if(argc>1)
+    filename = std::string(argv[1]);
+  std::ifstream settings(filename);
+  long m, n;
+  while(settings >> m >> n)
+  {
+    //std::cerr << "  Testing " << m << " " << n << std::endl;
+    results.push_back( bench(m, n, f) );
+  }
+
+  std::cout << RowVectorXd::Map(results.data(), results.size());
+
+  return 0;
+}
diff --git a/contrib/eigen/bench/perf_monitoring/gemv_settings.txt b/contrib/eigen/bench/perf_monitoring/gemv_settings.txt
new file mode 100644
index 0000000000000000000000000000000000000000..21a5ee051984f74da4d5d88ff3508d4fa3f7125c
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/gemv_settings.txt
@@ -0,0 +1,11 @@
+8 8
+9 9
+24 24
+239 239
+240 240
+2400 24
+24 2400
+24 240
+2400 2400
+4800 23
+23 4800
diff --git a/contrib/eigen/bench/perf_monitoring/gemv_square_settings.txt b/contrib/eigen/bench/perf_monitoring/gemv_square_settings.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5165759f494e7a3b520103c62b32cdcc9608064b
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/gemv_square_settings.txt
@@ -0,0 +1,13 @@
+8 8
+9 9
+12 12
+15 15
+16 16
+24 24
+53 53
+74 74
+102 102
+239 239
+240 240
+2400 2400
+2463 2463
diff --git a/contrib/eigen/bench/perf_monitoring/gemvt.cpp b/contrib/eigen/bench/perf_monitoring/gemvt.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe945767ed223ca1df1fa15938b9b4c2b4dd7903
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/gemvt.cpp
@@ -0,0 +1,12 @@
+#include "gemv_common.h"
+
+EIGEN_DONT_INLINE
+void gemv(const Mat &A, Vec &B, const Vec &C)
+{
+  B.noalias() += A.transpose() * C;
+}
+
+int main(int argc, char **argv)
+{
+  return main_gemv(argc, argv, gemv);
+}
diff --git a/contrib/eigen/bench/perf_monitoring/gemm/lazy_gemm.cpp b/contrib/eigen/bench/perf_monitoring/lazy_gemm.cpp
similarity index 93%
rename from contrib/eigen/bench/perf_monitoring/gemm/lazy_gemm.cpp
rename to contrib/eigen/bench/perf_monitoring/lazy_gemm.cpp
index 6dc370155206866fc584ebb94cc646423baa272d..773306048d8e1bb85baf919585a9dca412890203 100644
--- a/contrib/eigen/bench/perf_monitoring/gemm/lazy_gemm.cpp
+++ b/contrib/eigen/bench/perf_monitoring/lazy_gemm.cpp
@@ -84,7 +84,10 @@ int main(int argc, char **argv)
 {
   std::vector<double> results;
   
-  std::ifstream settings("lazy_gemm_settings.txt");
+  std::string filename = std::string("lazy_gemm_settings.txt");
+  if(argc>1)
+    filename = std::string(argv[1]);
+  std::ifstream settings(filename);
   long m, n, k, t;
   while(settings >> m >> n >> k >> t)
   {
diff --git a/contrib/eigen/bench/perf_monitoring/gemm/lazy_gemm_settings.txt b/contrib/eigen/bench/perf_monitoring/lazy_gemm_settings.txt
similarity index 100%
rename from contrib/eigen/bench/perf_monitoring/gemm/lazy_gemm_settings.txt
rename to contrib/eigen/bench/perf_monitoring/lazy_gemm_settings.txt
diff --git a/contrib/eigen/bench/perf_monitoring/llt.cpp b/contrib/eigen/bench/perf_monitoring/llt.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d55b7d80340a7a07c1090da6f82bb97b515b4842
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/llt.cpp
@@ -0,0 +1,15 @@
+#include "gemm_common.h"
+#include <Eigen/Cholesky>
+
+EIGEN_DONT_INLINE
+void llt(const Mat &A, const Mat &B, Mat &C)
+{
+  C = A;
+  C.diagonal().array() += 1000;
+  Eigen::internal::llt_inplace<Mat::Scalar, Lower>::blocked(C);
+}
+
+int main(int argc, char **argv)
+{
+  return main_gemm(argc, argv, llt);
+}
diff --git a/contrib/eigen/bench/perf_monitoring/make_plot.sh b/contrib/eigen/bench/perf_monitoring/make_plot.sh
new file mode 100755
index 0000000000000000000000000000000000000000..65aaf66f9612f3f333cb6dd931af95f81aa5c457
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/make_plot.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+# base name of the bench
+# it reads $1.out
+# and generates $1.pdf
+WHAT=$1
+bench=$2
+settings_file=$3
+
+header="rev "
+while read line
+do
+  if [ ! -z '$line' ]; then
+    header="$header  \"$line\""
+  fi
+done < $settings_file
+
+echo $header > $WHAT.out.header
+cat $WHAT.out >> $WHAT.out.header
+
+
+echo "set title '$WHAT'" > $WHAT.gnuplot
+echo "set key autotitle columnhead outside " >> $WHAT.gnuplot
+echo "set xtics rotate 1" >> $WHAT.gnuplot
+
+echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot
+echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot
+
+col=`cat $settings_file | wc -l`
+echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot
+echo " " >>  $WHAT.gnuplot
+
+gnuplot -persist < $WHAT.gnuplot
+
+# generate a png file (thumbnail)
+convert -colors 256 -background white -density 300 -resize 300  -quality 0 $WHAT.pdf -background white -flatten $WHAT.png
+
+# clean
+rm $WHAT.out.header $WHAT.gnuplot
+
+
+# generate html/svg graph
+
+echo " " > $WHAT.html
+cat resources/chart_header.html > $WHAT.html
+echo 'var customSettings = {"TITLE":"","SUBTITLE":"","XLABEL":"","YLABEL":""};' >> $WHAT.html
+#  'data' is an array of datasets (i.e. curves), each of which is an object of the form
+#  {
+#    key: <name of the curve>,
+#    color: <optional color of the curve>,
+#    values: [{
+#        r: <revision number>,
+#        v: <GFlops>
+#    }]
+#  }
+echo 'var data = [' >> $WHAT.html
+
+col=2
+while read line
+do
+  if [ ! -z '$line' ]; then
+    header="$header  \"$line\""
+    echo '{"key":"'$line'","values":[' >> $WHAT.html
+    i=0
+    while read line2
+    do
+      if [ ! -z "$line2" ]; then
+        val=`echo $line2 | cut -s -f $col -d ' '`
+        if [ -n "$val" ]; then # skip build failures
+          echo '{"r":'$i',"v":'$val'},' >> $WHAT.html
+        fi
+      fi
+      ((i++))
+    done < $WHAT.out
+    echo ']},'  >> $WHAT.html
+  fi
+  ((col++))
+done < $settings_file
+echo '];'  >> $WHAT.html
+
+echo 'var changesets = [' >> $WHAT.html
+while read line2
+do
+  if [ ! -z '$line2' ]; then
+    echo '"'`echo $line2 | cut -f 1 -d ' '`'",' >> $WHAT.html
+  fi
+done < $WHAT.out
+echo '];'  >> $WHAT.html
+
+echo 'var changesets_details = [' >> $WHAT.html
+while read line2
+do
+  if [ ! -z '$line2' ]; then
+    num=`echo "$line2" | cut -f 1 -d ' '`
+    comment=`grep ":$num" changesets.txt | cut -f 2 -d '#'`
+    echo '"'"$comment"'",' >> $WHAT.html
+  fi
+done < $WHAT.out
+echo '];'  >> $WHAT.html
+
+echo 'var changesets_count = [' >> $WHAT.html
+i=0
+while read line2
+do
+  if [ ! -z '$line2' ]; then
+    echo $i ',' >> $WHAT.html
+  fi
+  ((i++))
+done < $WHAT.out
+echo '];'  >> $WHAT.html
+
+cat resources/chart_footer.html >> $WHAT.html
diff --git a/contrib/eigen/bench/perf_monitoring/resources/chart_footer.html b/contrib/eigen/bench/perf_monitoring/resources/chart_footer.html
new file mode 100644
index 0000000000000000000000000000000000000000..a96cdb898c09220e2dd7064bc69237b21d920d56
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/resources/chart_footer.html
@@ -0,0 +1,41 @@
+      /* setup the chart and its options */                                                                                
+      var chart = nv.models.lineChart()                                                                                    
+                    .color(d3.scale.category10().range())                                                                  
+                    .margin({left: 75, bottom: 100})                                                                        
+                    .forceX([0]).forceY([0]);                                                                              
+                                                                                                                           
+      chart.x(function(datum){ return datum.r; })                                                                          
+           .xAxis.options({                                                                                                
+             axisLabel: customSettings.XLABEL || 'Changeset',
+             tickFormat: d3.format('.0f')                                                                                  
+           });
+      chart.xAxis
+        .tickValues(changesets_count)
+        .tickFormat(function(d){return changesets[d]})
+        .rotateLabels(-90);
+                                                                                                                                                                                                       
+      chart.y(function(datum){ return datum.v; })                                                    
+            .yAxis.options({                                                                                              
+              axisLabel: customSettings.YLABEL || 'GFlops'/*,
+              tickFormat: function(val){ return d3.format('.0f')(val) + ' GFlops'; }*/
+            });
+      
+      chart.tooltip.headerFormatter(function(d) { return changesets[d]
+        + ' <p style="font-weight:normal;text-align: left;">'
+        + changesets_details[d] + "</p>"; });
+
+      //chart.useInteractiveGuideline(true);
+      d3.select('#chart').datum(data).call(chart);                                                                         
+      var plot = d3.select('#chart > g');                                                                                  
+                                                                                                                           
+      /* setup the title */                                                                                                
+      plot.append('text')                                                                                                  
+          .style('font-size', '24px')                                                                                      
+          .attr('text-anchor', 'middle').attr('x', '50%').attr('y', '20px')                                                
+          .text(customSettings.TITLE || '');                                                                                                                   
+                                                                                                                           
+      /* ensure the chart is responsive */                                                                                 
+      nv.utils.windowResize(chart.update);                                                                                 
+    </script>                                                                                                              
+  </body>                                                                                                                  
+</html>  
diff --git a/contrib/eigen/bench/perf_monitoring/resources/chart_header.html b/contrib/eigen/bench/perf_monitoring/resources/chart_header.html
new file mode 100644
index 0000000000000000000000000000000000000000..27eb02e546a8fc4bee5a1e998bee588fda7eaae9
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/resources/chart_header.html
@@ -0,0 +1,45 @@
+
+<!DOCTYPE html>                                                                                                            
+<html>                                                                                                                     
+  <head>                                                                                                                   
+    <meta charset='utf-8'>                                                                                                 
+    <style>.nvd3 .nv-axis line,.nvd3 .nv-axis path{fill:none;shape-rendering:crispEdges}.nv-brush .extent,.nvd3 .background path,.nvd3 .nv-axis line,.nvd3 .nv-axis path{shape-rendering:crispEdges}.nv-distx,.nv-disty,.nv-noninteractive,.nvd3 .nv-axis,.nvd3.nv-pie .nv-label,.nvd3.nv-sparklineplus g.nv-hoverValue{pointer-events:none}.nvtooltip,svg.nvd3-svg{display:block;-webkit-touch-callout:none;-khtml-user-select:none}.nvd3 .nv-axis{opacity:1}.nvd3 .nv-axis.nv-disabled,.nvd3 .nv-controlsWrap .nv-legend .nv-check-box .nv-check{opacity:0}.nvd3 .nv-axis path{stroke:#000;stroke-opacity:.75}.nvd3 .nv-axis path.domain{stroke-opacity:.75}.nvd3 .nv-axis.nv-x path.domain{stroke-opacity:0}.nvd3 .nv-axis line{stroke:#e5e5e5}.nvd3 .nv-axis .zero line, .nvd3 .nv-axis line.zero{stroke-opacity:.75}.nvd3 .nv-axis .nv-axisMaxMin text{font-weight:700}.nvd3 .x .nv-axis .nv-axisMaxMin text,.nvd3 .x2 .nv-axis .nv-axisMaxMin text,.nvd3 .x3 .nv-axis .nv-axisMaxMin text{text-anchor:middle}.nvd3 .nv-bars rect{fill-opacity:.75;transition:fill-opacity 250ms linear;-moz-transition:fill-opacity 250ms linear;-webkit-transition:fill-opacity 250ms linear}.nvd3 .nv-bars rect.hover{fill-opacity:1}.nvd3 .nv-bars .hover rect{fill:#add8e6}.nvd3 .nv-bars text{fill:transparent}.nvd3 .nv-bars .hover text{fill:rgba(0,0,0,1)}.nvd3 .nv-discretebar .nv-groups rect,.nvd3 .nv-multibar .nv-groups rect,.nvd3 .nv-multibarHorizontal .nv-groups rect{stroke-opacity:0;transition:fill-opacity 250ms linear;-moz-transition:fill-opacity 250ms linear;-webkit-transition:fill-opacity 250ms linear}.nvd3 .nv-candlestickBar .nv-ticks rect:hover,.nvd3 .nv-discretebar .nv-groups rect:hover,.nvd3 .nv-multibar .nv-groups rect:hover,.nvd3 .nv-multibarHorizontal .nv-groups rect:hover{fill-opacity:1}.nvd3 .nv-discretebar .nv-groups text,.nvd3 .nv-multibarHorizontal .nv-groups text{font-weight:700;fill:rgba(0,0,0,1);stroke:transparent}.nvd3 .nv-boxplot circle{fill-opacity:.5}.nvd3 .nv-boxplot circle:hover,.nvd3 .nv-boxplot rect:hover{fill-opacity:1}.nvd3 line.nv-boxplot-median{stroke:#000}.nv-boxplot-tick:hover{stroke-width:2.5px}.nvd3.nv-bullet{font:10px sans-serif}.nvd3.nv-bullet .nv-measure{fill-opacity:.8}.nvd3.nv-bullet .nv-measure:hover{fill-opacity:1}.nvd3.nv-bullet .nv-marker{stroke:#000;stroke-width:2px}.nvd3.nv-bullet .nv-markerTriangle{stroke:#000;fill:#fff;stroke-width:1.5px}.nvd3.nv-bullet .nv-markerLine{stroke:#000;stroke-width:1.5px}.nvd3.nv-bullet .nv-tick line{stroke:#666;stroke-width:.5px}.nvd3.nv-bullet .nv-range.nv-s0{fill:#eee}.nvd3.nv-bullet .nv-range.nv-s1{fill:#ddd}.nvd3.nv-bullet .nv-range.nv-s2{fill:#ccc}.nvd3.nv-bullet .nv-title{font-size:14px;font-weight:700}.nvd3.nv-bullet .nv-subtitle{fill:#999}.nvd3.nv-bullet .nv-range{fill:#bababa;fill-opacity:.4}.nvd3.nv-bullet .nv-range:hover{fill-opacity:.7}.nvd3.nv-candlestickBar .nv-ticks .nv-tick{stroke-width:1px}.nvd3.nv-candlestickBar .nv-ticks .nv-tick.hover{stroke-width:2px}.nvd3.nv-candlestickBar .nv-ticks .nv-tick.positive rect{stroke:#2ca02c;fill:#2ca02c}.nvd3.nv-candlestickBar .nv-ticks .nv-tick.negative rect{stroke:#d62728;fill:#d62728}.with-transitions .nv-candlestickBar .nv-ticks .nv-tick{transition:stroke-width 250ms linear,stroke-opacity 250ms linear;-moz-transition:stroke-width 250ms linear,stroke-opacity 250ms linear;-webkit-transition:stroke-width 250ms linear,stroke-opacity 250ms linear}.nvd3.nv-candlestickBar .nv-ticks line{stroke:#333}.nv-force-node{stroke:#fff;stroke-width:1.5px}.nv-force-link{stroke:#999;stroke-opacity:.6}.nv-force-node text{stroke-width:0}.nvd3 .nv-check-box .nv-box{fill-opacity:0;stroke-width:2}.nvd3 .nv-check-box .nv-check{fill-opacity:0;stroke-width:4}.nvd3 .nv-series.nv-disabled .nv-check-box .nv-check{fill-opacity:0;stroke-opacity:0}.nvd3.nv-linePlusBar .nv-bar rect{fill-opacity:.75}.nvd3.nv-linePlusBar .nv-bar rect:hover{fill-opacity:1}.nvd3 .nv-groups path.nv-line{fill:none}.nvd3 .nv-groups path.nv-area{stroke:none}.nvd3.nv-line .nvd3.nv-scatter .nv-groups .nv-point{fill-opacity:0;stroke-opacity:0}.nvd3.nv-scatter.nv-single-point .nv-groups .nv-point{fill-opacity:.5!important;stroke-opacity:.5!important}.with-transitions .nvd3 .nv-groups .nv-point{transition:stroke-width 250ms linear,stroke-opacity 250ms linear;-moz-transition:stroke-width 250ms linear,stroke-opacity 250ms linear;-webkit-transition:stroke-width 250ms linear,stroke-opacity 250ms linear}.nvd3 .nv-groups .nv-point.hover,.nvd3.nv-scatter .nv-groups .nv-point.hover{stroke-width:7px;fill-opacity:.95!important;stroke-opacity:.95!important}.nvd3 .nv-point-paths path{stroke:#aaa;stroke-opacity:0;fill:#eee;fill-opacity:0}.nvd3 .nv-indexLine{cursor:ew-resize}svg.nvd3-svg{-webkit-user-select:none;-ms-user-select:none;-moz-user-select:none;user-select:none;width:100%;height:100%}.nvtooltip.with-3d-shadow,.with-3d-shadow .nvtooltip{-moz-box-shadow:0 5px 10px rgba(0,0,0,.2);-webkit-box-shadow:0 5px 10px rgba(0,0,0,.2);box-shadow:0 5px 10px rgba(0,0,0,.2);-webkit-border-radius:5px;-moz-border-radius:5px;border-radius:5px}.nvd3 text{font:400 12px Arial}.nvd3 .title{font:700 14px Arial}.nvd3 .nv-background{fill:#fff;fill-opacity:0}.nvd3.nv-noData{font-size:18px;font-weight:700}.nv-brush .extent{fill-opacity:.125}.nv-brush .resize path{fill:#eee;stroke:#666}.nvd3 .nv-legend .nv-series{cursor:pointer}.nvd3 .nv-legend .nv-disabled circle{fill-opacity:0}.nvd3 .nv-brush .extent{fill-opacity:0!important}.nvd3 .nv-brushBackground rect{stroke:#000;stroke-width:.4;fill:#fff;fill-opacity:.7}@media print{.nvd3 text{stroke-width:0;fill-opacity:1}}.nvd3.nv-ohlcBar .nv-ticks .nv-tick{stroke-width:1px}.nvd3.nv-ohlcBar .nv-ticks .nv-tick.hover{stroke-width:2px}.nvd3.nv-ohlcBar .nv-ticks .nv-tick.positive{stroke:#2ca02c}.nvd3.nv-ohlcBar .nv-ticks .nv-tick.negative{stroke:#d62728}.nvd3 .background path{fill:none;stroke:#EEE;stroke-opacity:.4}.nvd3 .foreground path{fill:none;stroke-opacity:.7}.nvd3 .nv-parallelCoordinates-brush .extent{fill:#fff;fill-opacity:.6;stroke:gray;shape-rendering:crispEdges}.nvd3 .nv-parallelCoordinates .hover{fill-opacity:1;stroke-width:3px}.nvd3 .missingValuesline line{fill:none;stroke:#000;stroke-width:1;stroke-opacity:1;stroke-dasharray:5,5}.nvd3.nv-pie .nv-pie-title{font-size:24px;fill:rgba(19,196,249,.59)}.nvd3.nv-pie .nv-slice text{stroke:#000;stroke-width:0}.nvd3.nv-pie path{transition:fill-opacity 250ms linear,stroke-width 250ms linear,stroke-opacity 250ms linear;-moz-transition:fill-opacity 250ms linear,stroke-width 250ms linear,stroke-opacity 250ms linear;-webkit-transition:fill-opacity 250ms linear,stroke-width 250ms linear,stroke-opacity 250ms linear;stroke:#fff;stroke-width:1px;stroke-opacity:1;fill-opacity:.7}.nvd3.nv-pie .hover path{fill-opacity:1}.nvd3.nv-pie .nv-label rect{fill-opacity:0;stroke-opacity:0}.nvd3 .nv-groups .nv-point.hover{stroke-width:20px;stroke-opacity:.5}.nvd3 .nv-scatter .nv-point.hover{fill-opacity:1}.nvd3.nv-sparkline path{fill:none}.nvd3.nv-sparklineplus .nv-hoverValue line{stroke:#333;stroke-width:1.5px}.nvd3.nv-sparklineplus,.nvd3.nv-sparklineplus g{pointer-events:all}.nvd3 .nv-interactiveGuideLine,.nvtooltip{pointer-events:none}.nvd3 .nv-hoverArea{fill-opacity:0;stroke-opacity:0}.nvd3.nv-sparklineplus .nv-xValue,.nvd3.nv-sparklineplus .nv-yValue{stroke-width:0;font-size:.9em;font-weight:400}.nvd3.nv-sparklineplus .nv-yValue{stroke:#f66}.nvd3.nv-sparklineplus .nv-maxValue{stroke:#2ca02c;fill:#2ca02c}.nvd3.nv-sparklineplus .nv-minValue{stroke:#d62728;fill:#d62728}.nvd3.nv-sparklineplus .nv-currentValue{font-weight:700;font-size:1.1em}.nvtooltip h3,.nvtooltip table td.key{font-weight:400}.nvd3.nv-stackedarea path.nv-area{fill-opacity:.7;stroke-opacity:0;transition:fill-opacity 250ms linear,stroke-opacity 250ms linear;-moz-transition:fill-opacity 250ms linear,stroke-opacity 250ms linear;-webkit-transition:fill-opacity 250ms linear,stroke-opacity 250ms linear}.nvd3.nv-stackedarea path.nv-area.hover{fill-opacity:.9}.nvd3.nv-stackedarea .nv-groups .nv-point{stroke-opacity:0;fill-opacity:0}.nvtooltip{position:absolute;color:rgba(0,0,0,1);padding:1px;z-index:10000;font-family:Arial;font-size:13px;text-align:left;white-space:nowrap;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;background:rgba(255,255,255,.8);border:1px solid rgba(0,0,0,.5);border-radius:4px}.nvtooltip h3,.nvtooltip p{margin:0;text-align:center}.nvtooltip.with-transitions,.with-transitions .nvtooltip{transition:opacity 50ms linear;-moz-transition:opacity 50ms linear;-webkit-transition:opacity 50ms linear;transition-delay:200ms;-moz-transition-delay:200ms;-webkit-transition-delay:200ms}.nvtooltip.x-nvtooltip,.nvtooltip.y-nvtooltip{padding:8px}.nvtooltip h3{padding:4px 14px;line-height:18px;background-color:rgba(247,247,247,.75);color:rgba(0,0,0,1);border-bottom:1px solid #ebebeb;-webkit-border-radius:5px 5px 0 0;-moz-border-radius:5px 5px 0 0;border-radius:5px 5px 0 0}.nvtooltip p{padding:5px 14px}.nvtooltip span{display:inline-block;margin:2px 0}.nvtooltip table{margin:6px;border-spacing:0}.nvtooltip table td{padding:2px 9px 2px 0;vertical-align:middle}.nvtooltip table td.key.total{font-weight:700}.nvtooltip table td.value{text-align:right;font-weight:700}.nvtooltip table td.percent{color:#a9a9a9}.nvtooltip table tr.highlight td{padding:1px 9px 1px 0;border-bottom-style:solid;border-bottom-width:1px;border-top-style:solid;border-top-width:1px}.nvtooltip table td.legend-color-guide div{vertical-align:middle;width:12px;height:12px;border:1px solid #999}.nvtooltip .footer{padding:3px;text-align:center}.nvtooltip-pending-removal{pointer-events:none;display:none}.nvd3 line.nv-guideline{stroke:#ccc}</style>                                                             
+    <style>                                                                                                                
+      * {                                                                                                                  
+        box-sizing: border-box;                                                                                            
+      }                                                                                                                    
+      html, body {                                                                                                         
+        position: relative;                                                                                                
+        height: 100%;                                                                                                      
+        width: 100%;                                                                                                       
+        border: 0;                                                                                                         
+        margin: 0;                                                                                                         
+        padding: 0;                                                                                                        
+        font-size: 0;                                                                                                      
+      }                                                                                                                    
+      svg g {                                                                                                              
+        clip-path: none;                                                                                                   
+      }                                                                                                                    
+      svg text {                                                                                                           
+        fill: #333;                                                                                                        
+      }                                                                                                                    
+      .nv-axislabel {                                                                                                      
+        font-size: 16px !important;                                                                                        
+      }                                                                                                                    
+      .control {                                                                                                           
+        cursor: pointer;                                                                                                   
+        visibility: hidden;                                                                                                
+        pointer-events: visible;                                                                                           
+      }                                                                                                                    
+      @media (min-width: 768px) {                                                                                          
+        .control {                                                                                                         
+          visibility: visible;                                                                                             
+        }                                                                                                                  
+      }                                                                                                                    
+    </style>                                                                                                               
+    <script src="s1.js"></script>                                                              
+    <script src="s2.js"></script>                                                            
+  </head>                                                                                                                  
+  <body>                                                                                                                   
+    <svg id='chart'></svg> 
+    <script type='text/javascript'>
diff --git a/contrib/eigen/bench/perf_monitoring/resources/footer.html b/contrib/eigen/bench/perf_monitoring/resources/footer.html
new file mode 100644
index 0000000000000000000000000000000000000000..81d8c883c83d211f4b1dd56648f49828bb23e708
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/resources/footer.html
@@ -0,0 +1,3 @@
+</table>
+</body>
+</html>
diff --git a/contrib/eigen/bench/perf_monitoring/resources/header.html b/contrib/eigen/bench/perf_monitoring/resources/header.html
new file mode 100644
index 0000000000000000000000000000000000000000..1f22309094882ee77268dd419e8e21e8379aad25
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/resources/header.html
@@ -0,0 +1,42 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<title>Eigen performance monitoring</title>
+<style type="text/css">
+
+body
+{
+ background:#fff;
+}
+th {
+
+}
+img
+{
+ width:auto;
+ box-shadow:0px 0px 20px #cecece;
+ margin: 20px 20px  20px  20px;
+  -moz-transform: scale(1);
+ -moz-transition-duration: 0.4s;
+ -webkit-transition-duration: 0.4s;
+ -webkit-transform: scale(1);
+
+ -ms-transform: scale(1);
+ -ms-transition-duration: 0.4s;
+}
+img:hover
+{
+box-shadow: 5px 5px 20px #dcdcdc;
+ -moz-transform: scale(1.1);
+ -moz-transition-duration: 0.4s;
+ -webkit-transition-duration: 0.4s;
+ -webkit-transform: scale(1.1);
+
+ -ms-transform: scale(1.1);
+ -ms-transition-duration: 0.4s;
+
+}
+</style>
+</head>
+<body>
diff --git a/contrib/eigen/bench/perf_monitoring/resources/s1.js b/contrib/eigen/bench/perf_monitoring/resources/s1.js
new file mode 100644
index 0000000000000000000000000000000000000000..cfff2d3f59b162378049ea57eddd67c601061de2
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/resources/s1.js
@@ -0,0 +1 @@
+!function(){function n(n){return n&&(n.ownerDocument||n.document||n).documentElement}function t(n){return n&&(n.ownerDocument&&n.ownerDocument.defaultView||n.document&&n||n.defaultView)}function e(n,t){return t>n?-1:n>t?1:n>=t?0:NaN}function r(n){return null===n?NaN:+n}function i(n){return!isNaN(n)}function u(n){return{left:function(t,e,r,i){for(arguments.length<3&&(r=0),arguments.length<4&&(i=t.length);i>r;){var u=r+i>>>1;n(t[u],e)<0?r=u+1:i=u}return r},right:function(t,e,r,i){for(arguments.length<3&&(r=0),arguments.length<4&&(i=t.length);i>r;){var u=r+i>>>1;n(t[u],e)>0?i=u:r=u+1}return r}}}function o(n){return n.length}function a(n){for(var t=1;n*t%1;)t*=10;return t}function l(n,t){for(var e in t)Object.defineProperty(n.prototype,e,{value:t[e],enumerable:!1})}function c(){this._=Object.create(null)}function f(n){return(n+="")===bo||n[0]===_o?_o+n:n}function s(n){return(n+="")[0]===_o?n.slice(1):n}function h(n){return f(n)in this._}function p(n){return(n=f(n))in this._&&delete this._[n]}function g(){var n=[];for(var t in this._)n.push(s(t));return n}function v(){var n=0;for(var t in this._)++n;return n}function d(){for(var n in this._)return!1;return!0}function y(){this._=Object.create(null)}function m(n){return n}function M(n,t,e){return function(){var r=e.apply(t,arguments);return r===t?n:r}}function x(n,t){if(t in n)return t;t=t.charAt(0).toUpperCase()+t.slice(1);for(var e=0,r=wo.length;r>e;++e){var i=wo[e]+t;if(i in n)return i}}function b(){}function _(){}function w(n){function t(){for(var t,r=e,i=-1,u=r.length;++i<u;)(t=r[i].on)&&t.apply(this,arguments);return n}var e=[],r=new c;return t.on=function(t,i){var u,o=r.get(t);return arguments.length<2?o&&o.on:(o&&(o.on=null,e=e.slice(0,u=e.indexOf(o)).concat(e.slice(u+1)),r.remove(t)),i&&e.push(r.set(t,{on:i})),n)},t}function S(){ao.event.preventDefault()}function k(){for(var n,t=ao.event;n=t.sourceEvent;)t=n;return t}function N(n){for(var t=new _,e=0,r=arguments.length;++e<r;)t[arguments[e]]=w(t);return t.of=function(e,r){return function(i){try{var u=i.sourceEvent=ao.event;i.target=n,ao.event=i,t[i.type].apply(e,r)}finally{ao.event=u}}},t}function E(n){return ko(n,Co),n}function A(n){return"function"==typeof n?n:function(){return No(n,this)}}function C(n){return"function"==typeof n?n:function(){return Eo(n,this)}}function z(n,t){function e(){this.removeAttribute(n)}function r(){this.removeAttributeNS(n.space,n.local)}function i(){this.setAttribute(n,t)}function u(){this.setAttributeNS(n.space,n.local,t)}function o(){var e=t.apply(this,arguments);null==e?this.removeAttribute(n):this.setAttribute(n,e)}function a(){var e=t.apply(this,arguments);null==e?this.removeAttributeNS(n.space,n.local):this.setAttributeNS(n.space,n.local,e)}return n=ao.ns.qualify(n),null==t?n.local?r:e:"function"==typeof t?n.local?a:o:n.local?u:i}function L(n){return n.trim().replace(/\s+/g," ")}function q(n){return new RegExp("(?:^|\\s+)"+ao.requote(n)+"(?:\\s+|$)","g")}function T(n){return(n+"").trim().split(/^|\s+/)}function R(n,t){function e(){for(var e=-1;++e<i;)n[e](this,t)}function r(){for(var e=-1,r=t.apply(this,arguments);++e<i;)n[e](this,r)}n=T(n).map(D);var i=n.length;return"function"==typeof t?r:e}function D(n){var t=q(n);return function(e,r){if(i=e.classList)return r?i.add(n):i.remove(n);var i=e.getAttribute("class")||"";r?(t.lastIndex=0,t.test(i)||e.setAttribute("class",L(i+" "+n))):e.setAttribute("class",L(i.replace(t," ")))}}function P(n,t,e){function r(){this.style.removeProperty(n)}function i(){this.style.setProperty(n,t,e)}function u(){var r=t.apply(this,arguments);null==r?this.style.removeProperty(n):this.style.setProperty(n,r,e)}return null==t?r:"function"==typeof t?u:i}function U(n,t){function e(){delete this[n]}function r(){this[n]=t}function i(){var e=t.apply(this,arguments);null==e?delete this[n]:this[n]=e}return null==t?e:"function"==typeof t?i:r}function j(n){function t(){var t=this.ownerDocument,e=this.namespaceURI;return e===zo&&t.documentElement.namespaceURI===zo?t.createElement(n):t.createElementNS(e,n)}function e(){return this.ownerDocument.createElementNS(n.space,n.local)}return"function"==typeof n?n:(n=ao.ns.qualify(n)).local?e:t}function F(){var n=this.parentNode;n&&n.removeChild(this)}function H(n){return{__data__:n}}function O(n){return function(){return Ao(this,n)}}function I(n){return arguments.length||(n=e),function(t,e){return t&&e?n(t.__data__,e.__data__):!t-!e}}function Y(n,t){for(var e=0,r=n.length;r>e;e++)for(var i,u=n[e],o=0,a=u.length;a>o;o++)(i=u[o])&&t(i,o,e);return n}function Z(n){return ko(n,qo),n}function V(n){var t,e;return function(r,i,u){var o,a=n[u].update,l=a.length;for(u!=e&&(e=u,t=0),i>=t&&(t=i+1);!(o=a[t])&&++t<l;);return o}}function X(n,t,e){function r(){var t=this[o];t&&(this.removeEventListener(n,t,t.$),delete this[o])}function i(){var i=l(t,co(arguments));r.call(this),this.addEventListener(n,this[o]=i,i.$=e),i._=t}function u(){var t,e=new RegExp("^__on([^.]+)"+ao.requote(n)+"$");for(var r in this)if(t=r.match(e)){var i=this[r];this.removeEventListener(t[1],i,i.$),delete this[r]}}var o="__on"+n,a=n.indexOf("."),l=$;a>0&&(n=n.slice(0,a));var c=To.get(n);return c&&(n=c,l=B),a?t?i:r:t?b:u}function $(n,t){return function(e){var r=ao.event;ao.event=e,t[0]=this.__data__;try{n.apply(this,t)}finally{ao.event=r}}}function B(n,t){var e=$(n,t);return function(n){var t=this,r=n.relatedTarget;r&&(r===t||8&r.compareDocumentPosition(t))||e.call(t,n)}}function W(e){var r=".dragsuppress-"+ ++Do,i="click"+r,u=ao.select(t(e)).on("touchmove"+r,S).on("dragstart"+r,S).on("selectstart"+r,S);if(null==Ro&&(Ro="onselectstart"in e?!1:x(e.style,"userSelect")),Ro){var o=n(e).style,a=o[Ro];o[Ro]="none"}return function(n){if(u.on(r,null),Ro&&(o[Ro]=a),n){var t=function(){u.on(i,null)};u.on(i,function(){S(),t()},!0),setTimeout(t,0)}}}function J(n,e){e.changedTouches&&(e=e.changedTouches[0]);var r=n.ownerSVGElement||n;if(r.createSVGPoint){var i=r.createSVGPoint();if(0>Po){var u=t(n);if(u.scrollX||u.scrollY){r=ao.select("body").append("svg").style({position:"absolute",top:0,left:0,margin:0,padding:0,border:"none"},"important");var o=r[0][0].getScreenCTM();Po=!(o.f||o.e),r.remove()}}return Po?(i.x=e.pageX,i.y=e.pageY):(i.x=e.clientX,i.y=e.clientY),i=i.matrixTransform(n.getScreenCTM().inverse()),[i.x,i.y]}var a=n.getBoundingClientRect();return[e.clientX-a.left-n.clientLeft,e.clientY-a.top-n.clientTop]}function G(){return ao.event.changedTouches[0].identifier}function K(n){return n>0?1:0>n?-1:0}function Q(n,t,e){return(t[0]-n[0])*(e[1]-n[1])-(t[1]-n[1])*(e[0]-n[0])}function nn(n){return n>1?0:-1>n?Fo:Math.acos(n)}function tn(n){return n>1?Io:-1>n?-Io:Math.asin(n)}function en(n){return((n=Math.exp(n))-1/n)/2}function rn(n){return((n=Math.exp(n))+1/n)/2}function un(n){return((n=Math.exp(2*n))-1)/(n+1)}function on(n){return(n=Math.sin(n/2))*n}function an(){}function ln(n,t,e){return this instanceof ln?(this.h=+n,this.s=+t,void(this.l=+e)):arguments.length<2?n instanceof ln?new ln(n.h,n.s,n.l):_n(""+n,wn,ln):new ln(n,t,e)}function cn(n,t,e){function r(n){return n>360?n-=360:0>n&&(n+=360),60>n?u+(o-u)*n/60:180>n?o:240>n?u+(o-u)*(240-n)/60:u}function i(n){return Math.round(255*r(n))}var u,o;return n=isNaN(n)?0:(n%=360)<0?n+360:n,t=isNaN(t)?0:0>t?0:t>1?1:t,e=0>e?0:e>1?1:e,o=.5>=e?e*(1+t):e+t-e*t,u=2*e-o,new mn(i(n+120),i(n),i(n-120))}function fn(n,t,e){return this instanceof fn?(this.h=+n,this.c=+t,void(this.l=+e)):arguments.length<2?n instanceof fn?new fn(n.h,n.c,n.l):n instanceof hn?gn(n.l,n.a,n.b):gn((n=Sn((n=ao.rgb(n)).r,n.g,n.b)).l,n.a,n.b):new fn(n,t,e)}function sn(n,t,e){return isNaN(n)&&(n=0),isNaN(t)&&(t=0),new hn(e,Math.cos(n*=Yo)*t,Math.sin(n)*t)}function hn(n,t,e){return this instanceof hn?(this.l=+n,this.a=+t,void(this.b=+e)):arguments.length<2?n instanceof hn?new hn(n.l,n.a,n.b):n instanceof fn?sn(n.h,n.c,n.l):Sn((n=mn(n)).r,n.g,n.b):new hn(n,t,e)}function pn(n,t,e){var r=(n+16)/116,i=r+t/500,u=r-e/200;return i=vn(i)*na,r=vn(r)*ta,u=vn(u)*ea,new mn(yn(3.2404542*i-1.5371385*r-.4985314*u),yn(-.969266*i+1.8760108*r+.041556*u),yn(.0556434*i-.2040259*r+1.0572252*u))}function gn(n,t,e){return n>0?new fn(Math.atan2(e,t)*Zo,Math.sqrt(t*t+e*e),n):new fn(NaN,NaN,n)}function vn(n){return n>.206893034?n*n*n:(n-4/29)/7.787037}function dn(n){return n>.008856?Math.pow(n,1/3):7.787037*n+4/29}function yn(n){return Math.round(255*(.00304>=n?12.92*n:1.055*Math.pow(n,1/2.4)-.055))}function mn(n,t,e){return this instanceof mn?(this.r=~~n,this.g=~~t,void(this.b=~~e)):arguments.length<2?n instanceof mn?new mn(n.r,n.g,n.b):_n(""+n,mn,cn):new mn(n,t,e)}function Mn(n){return new mn(n>>16,n>>8&255,255&n)}function xn(n){return Mn(n)+""}function bn(n){return 16>n?"0"+Math.max(0,n).toString(16):Math.min(255,n).toString(16)}function _n(n,t,e){var r,i,u,o=0,a=0,l=0;if(r=/([a-z]+)\((.*)\)/.exec(n=n.toLowerCase()))switch(i=r[2].split(","),r[1]){case"hsl":return e(parseFloat(i[0]),parseFloat(i[1])/100,parseFloat(i[2])/100);case"rgb":return t(Nn(i[0]),Nn(i[1]),Nn(i[2]))}return(u=ua.get(n))?t(u.r,u.g,u.b):(null==n||"#"!==n.charAt(0)||isNaN(u=parseInt(n.slice(1),16))||(4===n.length?(o=(3840&u)>>4,o=o>>4|o,a=240&u,a=a>>4|a,l=15&u,l=l<<4|l):7===n.length&&(o=(16711680&u)>>16,a=(65280&u)>>8,l=255&u)),t(o,a,l))}function wn(n,t,e){var r,i,u=Math.min(n/=255,t/=255,e/=255),o=Math.max(n,t,e),a=o-u,l=(o+u)/2;return a?(i=.5>l?a/(o+u):a/(2-o-u),r=n==o?(t-e)/a+(e>t?6:0):t==o?(e-n)/a+2:(n-t)/a+4,r*=60):(r=NaN,i=l>0&&1>l?0:r),new ln(r,i,l)}function Sn(n,t,e){n=kn(n),t=kn(t),e=kn(e);var r=dn((.4124564*n+.3575761*t+.1804375*e)/na),i=dn((.2126729*n+.7151522*t+.072175*e)/ta),u=dn((.0193339*n+.119192*t+.9503041*e)/ea);return hn(116*i-16,500*(r-i),200*(i-u))}function kn(n){return(n/=255)<=.04045?n/12.92:Math.pow((n+.055)/1.055,2.4)}function Nn(n){var t=parseFloat(n);return"%"===n.charAt(n.length-1)?Math.round(2.55*t):t}function En(n){return"function"==typeof n?n:function(){return n}}function An(n){return function(t,e,r){return 2===arguments.length&&"function"==typeof e&&(r=e,e=null),Cn(t,e,n,r)}}function Cn(n,t,e,r){function i(){var n,t=l.status;if(!t&&Ln(l)||t>=200&&300>t||304===t){try{n=e.call(u,l)}catch(r){return void o.error.call(u,r)}o.load.call(u,n)}else o.error.call(u,l)}var u={},o=ao.dispatch("beforesend","progress","load","error"),a={},l=new XMLHttpRequest,c=null;return!this.XDomainRequest||"withCredentials"in l||!/^(http(s)?:)?\/\//.test(n)||(l=new XDomainRequest),"onload"in l?l.onload=l.onerror=i:l.onreadystatechange=function(){l.readyState>3&&i()},l.onprogress=function(n){var t=ao.event;ao.event=n;try{o.progress.call(u,l)}finally{ao.event=t}},u.header=function(n,t){return n=(n+"").toLowerCase(),arguments.length<2?a[n]:(null==t?delete a[n]:a[n]=t+"",u)},u.mimeType=function(n){return arguments.length?(t=null==n?null:n+"",u):t},u.responseType=function(n){return arguments.length?(c=n,u):c},u.response=function(n){return e=n,u},["get","post"].forEach(function(n){u[n]=function(){return u.send.apply(u,[n].concat(co(arguments)))}}),u.send=function(e,r,i){if(2===arguments.length&&"function"==typeof r&&(i=r,r=null),l.open(e,n,!0),null==t||"accept"in a||(a.accept=t+",*/*"),l.setRequestHeader)for(var f in a)l.setRequestHeader(f,a[f]);return null!=t&&l.overrideMimeType&&l.overrideMimeType(t),null!=c&&(l.responseType=c),null!=i&&u.on("error",i).on("load",function(n){i(null,n)}),o.beforesend.call(u,l),l.send(null==r?null:r),u},u.abort=function(){return l.abort(),u},ao.rebind(u,o,"on"),null==r?u:u.get(zn(r))}function zn(n){return 1===n.length?function(t,e){n(null==t?e:null)}:n}function Ln(n){var t=n.responseType;return t&&"text"!==t?n.response:n.responseText}function qn(n,t,e){var r=arguments.length;2>r&&(t=0),3>r&&(e=Date.now());var i=e+t,u={c:n,t:i,n:null};return aa?aa.n=u:oa=u,aa=u,la||(ca=clearTimeout(ca),la=1,fa(Tn)),u}function Tn(){var n=Rn(),t=Dn()-n;t>24?(isFinite(t)&&(clearTimeout(ca),ca=setTimeout(Tn,t)),la=0):(la=1,fa(Tn))}function Rn(){for(var n=Date.now(),t=oa;t;)n>=t.t&&t.c(n-t.t)&&(t.c=null),t=t.n;return n}function Dn(){for(var n,t=oa,e=1/0;t;)t.c?(t.t<e&&(e=t.t),t=(n=t).n):t=n?n.n=t.n:oa=t.n;return aa=n,e}function Pn(n,t){return t-(n?Math.ceil(Math.log(n)/Math.LN10):1)}function Un(n,t){var e=Math.pow(10,3*xo(8-t));return{scale:t>8?function(n){return n/e}:function(n){return n*e},symbol:n}}function jn(n){var t=n.decimal,e=n.thousands,r=n.grouping,i=n.currency,u=r&&e?function(n,t){for(var i=n.length,u=[],o=0,a=r[0],l=0;i>0&&a>0&&(l+a+1>t&&(a=Math.max(1,t-l)),u.push(n.substring(i-=a,i+a)),!((l+=a+1)>t));)a=r[o=(o+1)%r.length];return u.reverse().join(e)}:m;return function(n){var e=ha.exec(n),r=e[1]||" ",o=e[2]||">",a=e[3]||"-",l=e[4]||"",c=e[5],f=+e[6],s=e[7],h=e[8],p=e[9],g=1,v="",d="",y=!1,m=!0;switch(h&&(h=+h.substring(1)),(c||"0"===r&&"="===o)&&(c=r="0",o="="),p){case"n":s=!0,p="g";break;case"%":g=100,d="%",p="f";break;case"p":g=100,d="%",p="r";break;case"b":case"o":case"x":case"X":"#"===l&&(v="0"+p.toLowerCase());case"c":m=!1;case"d":y=!0,h=0;break;case"s":g=-1,p="r"}"$"===l&&(v=i[0],d=i[1]),"r"!=p||h||(p="g"),null!=h&&("g"==p?h=Math.max(1,Math.min(21,h)):"e"!=p&&"f"!=p||(h=Math.max(0,Math.min(20,h)))),p=pa.get(p)||Fn;var M=c&&s;return function(n){var e=d;if(y&&n%1)return"";var i=0>n||0===n&&0>1/n?(n=-n,"-"):"-"===a?"":a;if(0>g){var l=ao.formatPrefix(n,h);n=l.scale(n),e=l.symbol+d}else n*=g;n=p(n,h);var x,b,_=n.lastIndexOf(".");if(0>_){var w=m?n.lastIndexOf("e"):-1;0>w?(x=n,b=""):(x=n.substring(0,w),b=n.substring(w))}else x=n.substring(0,_),b=t+n.substring(_+1);!c&&s&&(x=u(x,1/0));var S=v.length+x.length+b.length+(M?0:i.length),k=f>S?new Array(S=f-S+1).join(r):"";return M&&(x=u(k+x,k.length?f-b.length:1/0)),i+=v,n=x+b,("<"===o?i+n+k:">"===o?k+i+n:"^"===o?k.substring(0,S>>=1)+i+n+k.substring(S):i+(M?n:k+n))+e}}}function Fn(n){return n+""}function Hn(){this._=new Date(arguments.length>1?Date.UTC.apply(this,arguments):arguments[0])}function On(n,t,e){function r(t){var e=n(t),r=u(e,1);return r-t>t-e?e:r}function i(e){return t(e=n(new va(e-1)),1),e}function u(n,e){return t(n=new va(+n),e),n}function o(n,r,u){var o=i(n),a=[];if(u>1)for(;r>o;)e(o)%u||a.push(new Date(+o)),t(o,1);else for(;r>o;)a.push(new Date(+o)),t(o,1);return a}function a(n,t,e){try{va=Hn;var r=new Hn;return r._=n,o(r,t,e)}finally{va=Date}}n.floor=n,n.round=r,n.ceil=i,n.offset=u,n.range=o;var l=n.utc=In(n);return l.floor=l,l.round=In(r),l.ceil=In(i),l.offset=In(u),l.range=a,n}function In(n){return function(t,e){try{va=Hn;var r=new Hn;return r._=t,n(r,e)._}finally{va=Date}}}function Yn(n){function t(n){function t(t){for(var e,i,u,o=[],a=-1,l=0;++a<r;)37===n.charCodeAt(a)&&(o.push(n.slice(l,a)),null!=(i=ya[e=n.charAt(++a)])&&(e=n.charAt(++a)),(u=A[e])&&(e=u(t,null==i?"e"===e?" ":"0":i)),o.push(e),l=a+1);return o.push(n.slice(l,a)),o.join("")}var r=n.length;return t.parse=function(t){var r={y:1900,m:0,d:1,H:0,M:0,S:0,L:0,Z:null},i=e(r,n,t,0);if(i!=t.length)return null;"p"in r&&(r.H=r.H%12+12*r.p);var u=null!=r.Z&&va!==Hn,o=new(u?Hn:va);return"j"in r?o.setFullYear(r.y,0,r.j):"W"in r||"U"in r?("w"in r||(r.w="W"in r?1:0),o.setFullYear(r.y,0,1),o.setFullYear(r.y,0,"W"in r?(r.w+6)%7+7*r.W-(o.getDay()+5)%7:r.w+7*r.U-(o.getDay()+6)%7)):o.setFullYear(r.y,r.m,r.d),o.setHours(r.H+(r.Z/100|0),r.M+r.Z%100,r.S,r.L),u?o._:o},t.toString=function(){return n},t}function e(n,t,e,r){for(var i,u,o,a=0,l=t.length,c=e.length;l>a;){if(r>=c)return-1;if(i=t.charCodeAt(a++),37===i){if(o=t.charAt(a++),u=C[o in ya?t.charAt(a++):o],!u||(r=u(n,e,r))<0)return-1}else if(i!=e.charCodeAt(r++))return-1}return r}function r(n,t,e){_.lastIndex=0;var r=_.exec(t.slice(e));return r?(n.w=w.get(r[0].toLowerCase()),e+r[0].length):-1}function i(n,t,e){x.lastIndex=0;var r=x.exec(t.slice(e));return r?(n.w=b.get(r[0].toLowerCase()),e+r[0].length):-1}function u(n,t,e){N.lastIndex=0;var r=N.exec(t.slice(e));return r?(n.m=E.get(r[0].toLowerCase()),e+r[0].length):-1}function o(n,t,e){S.lastIndex=0;var r=S.exec(t.slice(e));return r?(n.m=k.get(r[0].toLowerCase()),e+r[0].length):-1}function a(n,t,r){return e(n,A.c.toString(),t,r)}function l(n,t,r){return e(n,A.x.toString(),t,r)}function c(n,t,r){return e(n,A.X.toString(),t,r)}function f(n,t,e){var r=M.get(t.slice(e,e+=2).toLowerCase());return null==r?-1:(n.p=r,e)}var s=n.dateTime,h=n.date,p=n.time,g=n.periods,v=n.days,d=n.shortDays,y=n.months,m=n.shortMonths;t.utc=function(n){function e(n){try{va=Hn;var t=new va;return t._=n,r(t)}finally{va=Date}}var r=t(n);return e.parse=function(n){try{va=Hn;var t=r.parse(n);return t&&t._}finally{va=Date}},e.toString=r.toString,e},t.multi=t.utc.multi=ct;var M=ao.map(),x=Vn(v),b=Xn(v),_=Vn(d),w=Xn(d),S=Vn(y),k=Xn(y),N=Vn(m),E=Xn(m);g.forEach(function(n,t){M.set(n.toLowerCase(),t)});var A={a:function(n){return d[n.getDay()]},A:function(n){return v[n.getDay()]},b:function(n){return m[n.getMonth()]},B:function(n){return y[n.getMonth()]},c:t(s),d:function(n,t){return Zn(n.getDate(),t,2)},e:function(n,t){return Zn(n.getDate(),t,2)},H:function(n,t){return Zn(n.getHours(),t,2)},I:function(n,t){return Zn(n.getHours()%12||12,t,2)},j:function(n,t){return Zn(1+ga.dayOfYear(n),t,3)},L:function(n,t){return Zn(n.getMilliseconds(),t,3)},m:function(n,t){return Zn(n.getMonth()+1,t,2)},M:function(n,t){return Zn(n.getMinutes(),t,2)},p:function(n){return g[+(n.getHours()>=12)]},S:function(n,t){return Zn(n.getSeconds(),t,2)},U:function(n,t){return Zn(ga.sundayOfYear(n),t,2)},w:function(n){return n.getDay()},W:function(n,t){return Zn(ga.mondayOfYear(n),t,2)},x:t(h),X:t(p),y:function(n,t){return Zn(n.getFullYear()%100,t,2)},Y:function(n,t){return Zn(n.getFullYear()%1e4,t,4)},Z:at,"%":function(){return"%"}},C={a:r,A:i,b:u,B:o,c:a,d:tt,e:tt,H:rt,I:rt,j:et,L:ot,m:nt,M:it,p:f,S:ut,U:Bn,w:$n,W:Wn,x:l,X:c,y:Gn,Y:Jn,Z:Kn,"%":lt};return t}function Zn(n,t,e){var r=0>n?"-":"",i=(r?-n:n)+"",u=i.length;return r+(e>u?new Array(e-u+1).join(t)+i:i)}function Vn(n){return new RegExp("^(?:"+n.map(ao.requote).join("|")+")","i")}function Xn(n){for(var t=new c,e=-1,r=n.length;++e<r;)t.set(n[e].toLowerCase(),e);return t}function $n(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+1));return r?(n.w=+r[0],e+r[0].length):-1}function Bn(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e));return r?(n.U=+r[0],e+r[0].length):-1}function Wn(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e));return r?(n.W=+r[0],e+r[0].length):-1}function Jn(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+4));return r?(n.y=+r[0],e+r[0].length):-1}function Gn(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+2));return r?(n.y=Qn(+r[0]),e+r[0].length):-1}function Kn(n,t,e){return/^[+-]\d{4}$/.test(t=t.slice(e,e+5))?(n.Z=-t,e+5):-1}function Qn(n){return n+(n>68?1900:2e3)}function nt(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+2));return r?(n.m=r[0]-1,e+r[0].length):-1}function tt(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+2));return r?(n.d=+r[0],e+r[0].length):-1}function et(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+3));return r?(n.j=+r[0],e+r[0].length):-1}function rt(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+2));return r?(n.H=+r[0],e+r[0].length):-1}function it(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+2));return r?(n.M=+r[0],e+r[0].length):-1}function ut(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+2));return r?(n.S=+r[0],e+r[0].length):-1}function ot(n,t,e){ma.lastIndex=0;var r=ma.exec(t.slice(e,e+3));return r?(n.L=+r[0],e+r[0].length):-1}function at(n){var t=n.getTimezoneOffset(),e=t>0?"-":"+",r=xo(t)/60|0,i=xo(t)%60;return e+Zn(r,"0",2)+Zn(i,"0",2)}function lt(n,t,e){Ma.lastIndex=0;var r=Ma.exec(t.slice(e,e+1));return r?e+r[0].length:-1}function ct(n){for(var t=n.length,e=-1;++e<t;)n[e][0]=this(n[e][0]);return function(t){for(var e=0,r=n[e];!r[1](t);)r=n[++e];return r[0](t)}}function ft(){}function st(n,t,e){var r=e.s=n+t,i=r-n,u=r-i;e.t=n-u+(t-i)}function ht(n,t){n&&wa.hasOwnProperty(n.type)&&wa[n.type](n,t)}function pt(n,t,e){var r,i=-1,u=n.length-e;for(t.lineStart();++i<u;)r=n[i],t.point(r[0],r[1],r[2]);t.lineEnd()}function gt(n,t){var e=-1,r=n.length;for(t.polygonStart();++e<r;)pt(n[e],t,1);t.polygonEnd()}function vt(){function n(n,t){n*=Yo,t=t*Yo/2+Fo/4;var e=n-r,o=e>=0?1:-1,a=o*e,l=Math.cos(t),c=Math.sin(t),f=u*c,s=i*l+f*Math.cos(a),h=f*o*Math.sin(a);ka.add(Math.atan2(h,s)),r=n,i=l,u=c}var t,e,r,i,u;Na.point=function(o,a){Na.point=n,r=(t=o)*Yo,i=Math.cos(a=(e=a)*Yo/2+Fo/4),u=Math.sin(a)},Na.lineEnd=function(){n(t,e)}}function dt(n){var t=n[0],e=n[1],r=Math.cos(e);return[r*Math.cos(t),r*Math.sin(t),Math.sin(e)]}function yt(n,t){return n[0]*t[0]+n[1]*t[1]+n[2]*t[2]}function mt(n,t){return[n[1]*t[2]-n[2]*t[1],n[2]*t[0]-n[0]*t[2],n[0]*t[1]-n[1]*t[0]]}function Mt(n,t){n[0]+=t[0],n[1]+=t[1],n[2]+=t[2]}function xt(n,t){return[n[0]*t,n[1]*t,n[2]*t]}function bt(n){var t=Math.sqrt(n[0]*n[0]+n[1]*n[1]+n[2]*n[2]);n[0]/=t,n[1]/=t,n[2]/=t}function _t(n){return[Math.atan2(n[1],n[0]),tn(n[2])]}function wt(n,t){return xo(n[0]-t[0])<Uo&&xo(n[1]-t[1])<Uo}function St(n,t){n*=Yo;var e=Math.cos(t*=Yo);kt(e*Math.cos(n),e*Math.sin(n),Math.sin(t))}function kt(n,t,e){++Ea,Ca+=(n-Ca)/Ea,za+=(t-za)/Ea,La+=(e-La)/Ea}function Nt(){function n(n,i){n*=Yo;var u=Math.cos(i*=Yo),o=u*Math.cos(n),a=u*Math.sin(n),l=Math.sin(i),c=Math.atan2(Math.sqrt((c=e*l-r*a)*c+(c=r*o-t*l)*c+(c=t*a-e*o)*c),t*o+e*a+r*l);Aa+=c,qa+=c*(t+(t=o)),Ta+=c*(e+(e=a)),Ra+=c*(r+(r=l)),kt(t,e,r)}var t,e,r;ja.point=function(i,u){i*=Yo;var o=Math.cos(u*=Yo);t=o*Math.cos(i),e=o*Math.sin(i),r=Math.sin(u),ja.point=n,kt(t,e,r)}}function Et(){ja.point=St}function At(){function n(n,t){n*=Yo;var e=Math.cos(t*=Yo),o=e*Math.cos(n),a=e*Math.sin(n),l=Math.sin(t),c=i*l-u*a,f=u*o-r*l,s=r*a-i*o,h=Math.sqrt(c*c+f*f+s*s),p=r*o+i*a+u*l,g=h&&-nn(p)/h,v=Math.atan2(h,p);Da+=g*c,Pa+=g*f,Ua+=g*s,Aa+=v,qa+=v*(r+(r=o)),Ta+=v*(i+(i=a)),Ra+=v*(u+(u=l)),kt(r,i,u)}var t,e,r,i,u;ja.point=function(o,a){t=o,e=a,ja.point=n,o*=Yo;var l=Math.cos(a*=Yo);r=l*Math.cos(o),i=l*Math.sin(o),u=Math.sin(a),kt(r,i,u)},ja.lineEnd=function(){n(t,e),ja.lineEnd=Et,ja.point=St}}function Ct(n,t){function e(e,r){return e=n(e,r),t(e[0],e[1])}return n.invert&&t.invert&&(e.invert=function(e,r){return e=t.invert(e,r),e&&n.invert(e[0],e[1])}),e}function zt(){return!0}function Lt(n,t,e,r,i){var u=[],o=[];if(n.forEach(function(n){if(!((t=n.length-1)<=0)){var t,e=n[0],r=n[t];if(wt(e,r)){i.lineStart();for(var a=0;t>a;++a)i.point((e=n[a])[0],e[1]);return void i.lineEnd()}var l=new Tt(e,n,null,!0),c=new Tt(e,null,l,!1);l.o=c,u.push(l),o.push(c),l=new Tt(r,n,null,!1),c=new Tt(r,null,l,!0),l.o=c,u.push(l),o.push(c)}}),o.sort(t),qt(u),qt(o),u.length){for(var a=0,l=e,c=o.length;c>a;++a)o[a].e=l=!l;for(var f,s,h=u[0];;){for(var p=h,g=!0;p.v;)if((p=p.n)===h)return;f=p.z,i.lineStart();do{if(p.v=p.o.v=!0,p.e){if(g)for(var a=0,c=f.length;c>a;++a)i.point((s=f[a])[0],s[1]);else r(p.x,p.n.x,1,i);p=p.n}else{if(g){f=p.p.z;for(var a=f.length-1;a>=0;--a)i.point((s=f[a])[0],s[1])}else r(p.x,p.p.x,-1,i);p=p.p}p=p.o,f=p.z,g=!g}while(!p.v);i.lineEnd()}}}function qt(n){if(t=n.length){for(var t,e,r=0,i=n[0];++r<t;)i.n=e=n[r],e.p=i,i=e;i.n=e=n[0],e.p=i}}function Tt(n,t,e,r){this.x=n,this.z=t,this.o=e,this.e=r,this.v=!1,this.n=this.p=null}function Rt(n,t,e,r){return function(i,u){function o(t,e){var r=i(t,e);n(t=r[0],e=r[1])&&u.point(t,e)}function a(n,t){var e=i(n,t);d.point(e[0],e[1])}function l(){m.point=a,d.lineStart()}function c(){m.point=o,d.lineEnd()}function f(n,t){v.push([n,t]);var e=i(n,t);x.point(e[0],e[1])}function s(){x.lineStart(),v=[]}function h(){f(v[0][0],v[0][1]),x.lineEnd();var n,t=x.clean(),e=M.buffer(),r=e.length;if(v.pop(),g.push(v),v=null,r)if(1&t){n=e[0];var i,r=n.length-1,o=-1;if(r>0){for(b||(u.polygonStart(),b=!0),u.lineStart();++o<r;)u.point((i=n[o])[0],i[1]);u.lineEnd()}}else r>1&&2&t&&e.push(e.pop().concat(e.shift())),p.push(e.filter(Dt))}var p,g,v,d=t(u),y=i.invert(r[0],r[1]),m={point:o,lineStart:l,lineEnd:c,polygonStart:function(){m.point=f,m.lineStart=s,m.lineEnd=h,p=[],g=[]},polygonEnd:function(){m.point=o,m.lineStart=l,m.lineEnd=c,p=ao.merge(p);var n=Ot(y,g);p.length?(b||(u.polygonStart(),b=!0),Lt(p,Ut,n,e,u)):n&&(b||(u.polygonStart(),b=!0),u.lineStart(),e(null,null,1,u),u.lineEnd()),b&&(u.polygonEnd(),b=!1),p=g=null},sphere:function(){u.polygonStart(),u.lineStart(),e(null,null,1,u),u.lineEnd(),u.polygonEnd()}},M=Pt(),x=t(M),b=!1;return m}}function Dt(n){return n.length>1}function Pt(){var n,t=[];return{lineStart:function(){t.push(n=[])},point:function(t,e){n.push([t,e])},lineEnd:b,buffer:function(){var e=t;return t=[],n=null,e},rejoin:function(){t.length>1&&t.push(t.pop().concat(t.shift()))}}}function Ut(n,t){return((n=n.x)[0]<0?n[1]-Io-Uo:Io-n[1])-((t=t.x)[0]<0?t[1]-Io-Uo:Io-t[1])}function jt(n){var t,e=NaN,r=NaN,i=NaN;return{lineStart:function(){n.lineStart(),t=1},point:function(u,o){var a=u>0?Fo:-Fo,l=xo(u-e);xo(l-Fo)<Uo?(n.point(e,r=(r+o)/2>0?Io:-Io),n.point(i,r),n.lineEnd(),n.lineStart(),n.point(a,r),n.point(u,r),t=0):i!==a&&l>=Fo&&(xo(e-i)<Uo&&(e-=i*Uo),xo(u-a)<Uo&&(u-=a*Uo),r=Ft(e,r,u,o),n.point(i,r),n.lineEnd(),n.lineStart(),n.point(a,r),t=0),n.point(e=u,r=o),i=a},lineEnd:function(){n.lineEnd(),e=r=NaN},clean:function(){return 2-t}}}function Ft(n,t,e,r){var i,u,o=Math.sin(n-e);return xo(o)>Uo?Math.atan((Math.sin(t)*(u=Math.cos(r))*Math.sin(e)-Math.sin(r)*(i=Math.cos(t))*Math.sin(n))/(i*u*o)):(t+r)/2}function Ht(n,t,e,r){var i;if(null==n)i=e*Io,r.point(-Fo,i),r.point(0,i),r.point(Fo,i),r.point(Fo,0),r.point(Fo,-i),r.point(0,-i),r.point(-Fo,-i),r.point(-Fo,0),r.point(-Fo,i);else if(xo(n[0]-t[0])>Uo){var u=n[0]<t[0]?Fo:-Fo;i=e*u/2,r.point(-u,i),r.point(0,i),r.point(u,i)}else r.point(t[0],t[1])}function Ot(n,t){var e=n[0],r=n[1],i=[Math.sin(e),-Math.cos(e),0],u=0,o=0;ka.reset();for(var a=0,l=t.length;l>a;++a){var c=t[a],f=c.length;if(f)for(var s=c[0],h=s[0],p=s[1]/2+Fo/4,g=Math.sin(p),v=Math.cos(p),d=1;;){d===f&&(d=0),n=c[d];var y=n[0],m=n[1]/2+Fo/4,M=Math.sin(m),x=Math.cos(m),b=y-h,_=b>=0?1:-1,w=_*b,S=w>Fo,k=g*M;if(ka.add(Math.atan2(k*_*Math.sin(w),v*x+k*Math.cos(w))),u+=S?b+_*Ho:b,S^h>=e^y>=e){var N=mt(dt(s),dt(n));bt(N);var E=mt(i,N);bt(E);var A=(S^b>=0?-1:1)*tn(E[2]);(r>A||r===A&&(N[0]||N[1]))&&(o+=S^b>=0?1:-1)}if(!d++)break;h=y,g=M,v=x,s=n}}return(-Uo>u||Uo>u&&-Uo>ka)^1&o}function It(n){function t(n,t){return Math.cos(n)*Math.cos(t)>u}function e(n){var e,u,l,c,f;return{lineStart:function(){c=l=!1,f=1},point:function(s,h){var p,g=[s,h],v=t(s,h),d=o?v?0:i(s,h):v?i(s+(0>s?Fo:-Fo),h):0;if(!e&&(c=l=v)&&n.lineStart(),v!==l&&(p=r(e,g),(wt(e,p)||wt(g,p))&&(g[0]+=Uo,g[1]+=Uo,v=t(g[0],g[1]))),v!==l)f=0,v?(n.lineStart(),p=r(g,e),n.point(p[0],p[1])):(p=r(e,g),n.point(p[0],p[1]),n.lineEnd()),e=p;else if(a&&e&&o^v){var y;d&u||!(y=r(g,e,!0))||(f=0,o?(n.lineStart(),n.point(y[0][0],y[0][1]),n.point(y[1][0],y[1][1]),n.lineEnd()):(n.point(y[1][0],y[1][1]),n.lineEnd(),n.lineStart(),n.point(y[0][0],y[0][1])))}!v||e&&wt(e,g)||n.point(g[0],g[1]),e=g,l=v,u=d},lineEnd:function(){l&&n.lineEnd(),e=null},clean:function(){return f|(c&&l)<<1}}}function r(n,t,e){var r=dt(n),i=dt(t),o=[1,0,0],a=mt(r,i),l=yt(a,a),c=a[0],f=l-c*c;if(!f)return!e&&n;var s=u*l/f,h=-u*c/f,p=mt(o,a),g=xt(o,s),v=xt(a,h);Mt(g,v);var d=p,y=yt(g,d),m=yt(d,d),M=y*y-m*(yt(g,g)-1);if(!(0>M)){var x=Math.sqrt(M),b=xt(d,(-y-x)/m);if(Mt(b,g),b=_t(b),!e)return b;var _,w=n[0],S=t[0],k=n[1],N=t[1];w>S&&(_=w,w=S,S=_);var E=S-w,A=xo(E-Fo)<Uo,C=A||Uo>E;if(!A&&k>N&&(_=k,k=N,N=_),C?A?k+N>0^b[1]<(xo(b[0]-w)<Uo?k:N):k<=b[1]&&b[1]<=N:E>Fo^(w<=b[0]&&b[0]<=S)){var z=xt(d,(-y+x)/m);return Mt(z,g),[b,_t(z)]}}}function i(t,e){var r=o?n:Fo-n,i=0;return-r>t?i|=1:t>r&&(i|=2),-r>e?i|=4:e>r&&(i|=8),i}var u=Math.cos(n),o=u>0,a=xo(u)>Uo,l=ve(n,6*Yo);return Rt(t,e,l,o?[0,-n]:[-Fo,n-Fo])}function Yt(n,t,e,r){return function(i){var u,o=i.a,a=i.b,l=o.x,c=o.y,f=a.x,s=a.y,h=0,p=1,g=f-l,v=s-c;if(u=n-l,g||!(u>0)){if(u/=g,0>g){if(h>u)return;p>u&&(p=u)}else if(g>0){if(u>p)return;u>h&&(h=u)}if(u=e-l,g||!(0>u)){if(u/=g,0>g){if(u>p)return;u>h&&(h=u)}else if(g>0){if(h>u)return;p>u&&(p=u)}if(u=t-c,v||!(u>0)){if(u/=v,0>v){if(h>u)return;p>u&&(p=u)}else if(v>0){if(u>p)return;u>h&&(h=u)}if(u=r-c,v||!(0>u)){if(u/=v,0>v){if(u>p)return;u>h&&(h=u)}else if(v>0){if(h>u)return;p>u&&(p=u)}return h>0&&(i.a={x:l+h*g,y:c+h*v}),1>p&&(i.b={x:l+p*g,y:c+p*v}),i}}}}}}function Zt(n,t,e,r){function i(r,i){return xo(r[0]-n)<Uo?i>0?0:3:xo(r[0]-e)<Uo?i>0?2:1:xo(r[1]-t)<Uo?i>0?1:0:i>0?3:2}function u(n,t){return o(n.x,t.x)}function o(n,t){var e=i(n,1),r=i(t,1);return e!==r?e-r:0===e?t[1]-n[1]:1===e?n[0]-t[0]:2===e?n[1]-t[1]:t[0]-n[0]}return function(a){function l(n){for(var t=0,e=d.length,r=n[1],i=0;e>i;++i)for(var u,o=1,a=d[i],l=a.length,c=a[0];l>o;++o)u=a[o],c[1]<=r?u[1]>r&&Q(c,u,n)>0&&++t:u[1]<=r&&Q(c,u,n)<0&&--t,c=u;return 0!==t}function c(u,a,l,c){var f=0,s=0;if(null==u||(f=i(u,l))!==(s=i(a,l))||o(u,a)<0^l>0){do c.point(0===f||3===f?n:e,f>1?r:t);while((f=(f+l+4)%4)!==s)}else c.point(a[0],a[1])}function f(i,u){return i>=n&&e>=i&&u>=t&&r>=u}function s(n,t){f(n,t)&&a.point(n,t)}function h(){C.point=g,d&&d.push(y=[]),S=!0,w=!1,b=_=NaN}function p(){v&&(g(m,M),x&&w&&E.rejoin(),v.push(E.buffer())),C.point=s,w&&a.lineEnd()}function g(n,t){n=Math.max(-Ha,Math.min(Ha,n)),t=Math.max(-Ha,Math.min(Ha,t));var e=f(n,t);if(d&&y.push([n,t]),S)m=n,M=t,x=e,S=!1,e&&(a.lineStart(),a.point(n,t));else if(e&&w)a.point(n,t);else{var r={a:{x:b,y:_},b:{x:n,y:t}};A(r)?(w||(a.lineStart(),a.point(r.a.x,r.a.y)),a.point(r.b.x,r.b.y),e||a.lineEnd(),k=!1):e&&(a.lineStart(),a.point(n,t),k=!1)}b=n,_=t,w=e}var v,d,y,m,M,x,b,_,w,S,k,N=a,E=Pt(),A=Yt(n,t,e,r),C={point:s,lineStart:h,lineEnd:p,polygonStart:function(){a=E,v=[],d=[],k=!0},polygonEnd:function(){a=N,v=ao.merge(v);var t=l([n,r]),e=k&&t,i=v.length;(e||i)&&(a.polygonStart(),e&&(a.lineStart(),c(null,null,1,a),a.lineEnd()),i&&Lt(v,u,t,c,a),a.polygonEnd()),v=d=y=null}};return C}}function Vt(n){var t=0,e=Fo/3,r=ae(n),i=r(t,e);return i.parallels=function(n){return arguments.length?r(t=n[0]*Fo/180,e=n[1]*Fo/180):[t/Fo*180,e/Fo*180]},i}function Xt(n,t){function e(n,t){var e=Math.sqrt(u-2*i*Math.sin(t))/i;return[e*Math.sin(n*=i),o-e*Math.cos(n)]}var r=Math.sin(n),i=(r+Math.sin(t))/2,u=1+r*(2*i-r),o=Math.sqrt(u)/i;return e.invert=function(n,t){var e=o-t;return[Math.atan2(n,e)/i,tn((u-(n*n+e*e)*i*i)/(2*i))]},e}function $t(){function n(n,t){Ia+=i*n-r*t,r=n,i=t}var t,e,r,i;$a.point=function(u,o){$a.point=n,t=r=u,e=i=o},$a.lineEnd=function(){n(t,e)}}function Bt(n,t){Ya>n&&(Ya=n),n>Va&&(Va=n),Za>t&&(Za=t),t>Xa&&(Xa=t)}function Wt(){function n(n,t){o.push("M",n,",",t,u)}function t(n,t){o.push("M",n,",",t),a.point=e}function e(n,t){o.push("L",n,",",t)}function r(){a.point=n}function i(){o.push("Z")}var u=Jt(4.5),o=[],a={point:n,lineStart:function(){a.point=t},lineEnd:r,polygonStart:function(){a.lineEnd=i},polygonEnd:function(){a.lineEnd=r,a.point=n},pointRadius:function(n){return u=Jt(n),a},result:function(){if(o.length){var n=o.join("");return o=[],n}}};return a}function Jt(n){return"m0,"+n+"a"+n+","+n+" 0 1,1 0,"+-2*n+"a"+n+","+n+" 0 1,1 0,"+2*n+"z"}function Gt(n,t){Ca+=n,za+=t,++La}function Kt(){function n(n,r){var i=n-t,u=r-e,o=Math.sqrt(i*i+u*u);qa+=o*(t+n)/2,Ta+=o*(e+r)/2,Ra+=o,Gt(t=n,e=r)}var t,e;Wa.point=function(r,i){Wa.point=n,Gt(t=r,e=i)}}function Qt(){Wa.point=Gt}function ne(){function n(n,t){var e=n-r,u=t-i,o=Math.sqrt(e*e+u*u);qa+=o*(r+n)/2,Ta+=o*(i+t)/2,Ra+=o,o=i*n-r*t,Da+=o*(r+n),Pa+=o*(i+t),Ua+=3*o,Gt(r=n,i=t)}var t,e,r,i;Wa.point=function(u,o){Wa.point=n,Gt(t=r=u,e=i=o)},Wa.lineEnd=function(){n(t,e)}}function te(n){function t(t,e){n.moveTo(t+o,e),n.arc(t,e,o,0,Ho)}function e(t,e){n.moveTo(t,e),a.point=r}function r(t,e){n.lineTo(t,e)}function i(){a.point=t}function u(){n.closePath()}var o=4.5,a={point:t,lineStart:function(){a.point=e},lineEnd:i,polygonStart:function(){a.lineEnd=u},polygonEnd:function(){a.lineEnd=i,a.point=t},pointRadius:function(n){return o=n,a},result:b};return a}function ee(n){function t(n){return(a?r:e)(n)}function e(t){return ue(t,function(e,r){e=n(e,r),t.point(e[0],e[1])})}function r(t){function e(e,r){e=n(e,r),t.point(e[0],e[1])}function r(){M=NaN,S.point=u,t.lineStart()}function u(e,r){var u=dt([e,r]),o=n(e,r);i(M,x,m,b,_,w,M=o[0],x=o[1],m=e,b=u[0],_=u[1],w=u[2],a,t),t.point(M,x)}function o(){S.point=e,t.lineEnd()}function l(){r(),S.point=c,S.lineEnd=f}function c(n,t){u(s=n,h=t),p=M,g=x,v=b,d=_,y=w,S.point=u}function f(){i(M,x,m,b,_,w,p,g,s,v,d,y,a,t),S.lineEnd=o,o()}var s,h,p,g,v,d,y,m,M,x,b,_,w,S={point:e,lineStart:r,lineEnd:o,polygonStart:function(){t.polygonStart(),S.lineStart=l},polygonEnd:function(){t.polygonEnd(),S.lineStart=r}};return S}function i(t,e,r,a,l,c,f,s,h,p,g,v,d,y){var m=f-t,M=s-e,x=m*m+M*M;if(x>4*u&&d--){var b=a+p,_=l+g,w=c+v,S=Math.sqrt(b*b+_*_+w*w),k=Math.asin(w/=S),N=xo(xo(w)-1)<Uo||xo(r-h)<Uo?(r+h)/2:Math.atan2(_,b),E=n(N,k),A=E[0],C=E[1],z=A-t,L=C-e,q=M*z-m*L;(q*q/x>u||xo((m*z+M*L)/x-.5)>.3||o>a*p+l*g+c*v)&&(i(t,e,r,a,l,c,A,C,N,b/=S,_/=S,w,d,y),y.point(A,C),i(A,C,N,b,_,w,f,s,h,p,g,v,d,y))}}var u=.5,o=Math.cos(30*Yo),a=16;return t.precision=function(n){return arguments.length?(a=(u=n*n)>0&&16,t):Math.sqrt(u)},t}function re(n){var t=ee(function(t,e){return n([t*Zo,e*Zo])});return function(n){return le(t(n))}}function ie(n){this.stream=n}function ue(n,t){return{point:t,sphere:function(){n.sphere()},lineStart:function(){n.lineStart()},lineEnd:function(){n.lineEnd()},polygonStart:function(){n.polygonStart()},polygonEnd:function(){n.polygonEnd()}}}function oe(n){return ae(function(){return n})()}function ae(n){function t(n){return n=a(n[0]*Yo,n[1]*Yo),[n[0]*h+l,c-n[1]*h]}function e(n){return n=a.invert((n[0]-l)/h,(c-n[1])/h),n&&[n[0]*Zo,n[1]*Zo]}function r(){a=Ct(o=se(y,M,x),u);var n=u(v,d);return l=p-n[0]*h,c=g+n[1]*h,i()}function i(){return f&&(f.valid=!1,f=null),t}var u,o,a,l,c,f,s=ee(function(n,t){return n=u(n,t),[n[0]*h+l,c-n[1]*h]}),h=150,p=480,g=250,v=0,d=0,y=0,M=0,x=0,b=Fa,_=m,w=null,S=null;return t.stream=function(n){return f&&(f.valid=!1),f=le(b(o,s(_(n)))),f.valid=!0,f},t.clipAngle=function(n){return arguments.length?(b=null==n?(w=n,Fa):It((w=+n)*Yo),i()):w},t.clipExtent=function(n){return arguments.length?(S=n,_=n?Zt(n[0][0],n[0][1],n[1][0],n[1][1]):m,i()):S},t.scale=function(n){return arguments.length?(h=+n,r()):h},t.translate=function(n){return arguments.length?(p=+n[0],g=+n[1],r()):[p,g]},t.center=function(n){return arguments.length?(v=n[0]%360*Yo,d=n[1]%360*Yo,r()):[v*Zo,d*Zo]},t.rotate=function(n){return arguments.length?(y=n[0]%360*Yo,M=n[1]%360*Yo,x=n.length>2?n[2]%360*Yo:0,r()):[y*Zo,M*Zo,x*Zo]},ao.rebind(t,s,"precision"),function(){return u=n.apply(this,arguments),t.invert=u.invert&&e,r()}}function le(n){return ue(n,function(t,e){n.point(t*Yo,e*Yo)})}function ce(n,t){return[n,t]}function fe(n,t){return[n>Fo?n-Ho:-Fo>n?n+Ho:n,t]}function se(n,t,e){return n?t||e?Ct(pe(n),ge(t,e)):pe(n):t||e?ge(t,e):fe}function he(n){return function(t,e){return t+=n,[t>Fo?t-Ho:-Fo>t?t+Ho:t,e]}}function pe(n){var t=he(n);return t.invert=he(-n),t}function ge(n,t){function e(n,t){var e=Math.cos(t),a=Math.cos(n)*e,l=Math.sin(n)*e,c=Math.sin(t),f=c*r+a*i;return[Math.atan2(l*u-f*o,a*r-c*i),tn(f*u+l*o)]}var r=Math.cos(n),i=Math.sin(n),u=Math.cos(t),o=Math.sin(t);return e.invert=function(n,t){var e=Math.cos(t),a=Math.cos(n)*e,l=Math.sin(n)*e,c=Math.sin(t),f=c*u-l*o;return[Math.atan2(l*u+c*o,a*r+f*i),tn(f*r-a*i)]},e}function ve(n,t){var e=Math.cos(n),r=Math.sin(n);return function(i,u,o,a){var l=o*t;null!=i?(i=de(e,i),u=de(e,u),(o>0?u>i:i>u)&&(i+=o*Ho)):(i=n+o*Ho,u=n-.5*l);for(var c,f=i;o>0?f>u:u>f;f-=l)a.point((c=_t([e,-r*Math.cos(f),-r*Math.sin(f)]))[0],c[1])}}function de(n,t){var e=dt(t);e[0]-=n,bt(e);var r=nn(-e[1]);return((-e[2]<0?-r:r)+2*Math.PI-Uo)%(2*Math.PI)}function ye(n,t,e){var r=ao.range(n,t-Uo,e).concat(t);return function(n){return r.map(function(t){return[n,t]})}}function me(n,t,e){var r=ao.range(n,t-Uo,e).concat(t);return function(n){return r.map(function(t){return[t,n]})}}function Me(n){return n.source}function xe(n){return n.target}function be(n,t,e,r){var i=Math.cos(t),u=Math.sin(t),o=Math.cos(r),a=Math.sin(r),l=i*Math.cos(n),c=i*Math.sin(n),f=o*Math.cos(e),s=o*Math.sin(e),h=2*Math.asin(Math.sqrt(on(r-t)+i*o*on(e-n))),p=1/Math.sin(h),g=h?function(n){var t=Math.sin(n*=h)*p,e=Math.sin(h-n)*p,r=e*l+t*f,i=e*c+t*s,o=e*u+t*a;return[Math.atan2(i,r)*Zo,Math.atan2(o,Math.sqrt(r*r+i*i))*Zo]}:function(){return[n*Zo,t*Zo]};return g.distance=h,g}function _e(){function n(n,i){var u=Math.sin(i*=Yo),o=Math.cos(i),a=xo((n*=Yo)-t),l=Math.cos(a);Ja+=Math.atan2(Math.sqrt((a=o*Math.sin(a))*a+(a=r*u-e*o*l)*a),e*u+r*o*l),t=n,e=u,r=o}var t,e,r;Ga.point=function(i,u){t=i*Yo,e=Math.sin(u*=Yo),r=Math.cos(u),Ga.point=n},Ga.lineEnd=function(){Ga.point=Ga.lineEnd=b}}function we(n,t){function e(t,e){var r=Math.cos(t),i=Math.cos(e),u=n(r*i);return[u*i*Math.sin(t),u*Math.sin(e)]}return e.invert=function(n,e){var r=Math.sqrt(n*n+e*e),i=t(r),u=Math.sin(i),o=Math.cos(i);return[Math.atan2(n*u,r*o),Math.asin(r&&e*u/r)]},e}function Se(n,t){function e(n,t){o>0?-Io+Uo>t&&(t=-Io+Uo):t>Io-Uo&&(t=Io-Uo);var e=o/Math.pow(i(t),u);return[e*Math.sin(u*n),o-e*Math.cos(u*n)]}var r=Math.cos(n),i=function(n){return Math.tan(Fo/4+n/2)},u=n===t?Math.sin(n):Math.log(r/Math.cos(t))/Math.log(i(t)/i(n)),o=r*Math.pow(i(n),u)/u;return u?(e.invert=function(n,t){var e=o-t,r=K(u)*Math.sqrt(n*n+e*e);return[Math.atan2(n,e)/u,2*Math.atan(Math.pow(o/r,1/u))-Io]},e):Ne}function ke(n,t){function e(n,t){var e=u-t;return[e*Math.sin(i*n),u-e*Math.cos(i*n)]}var r=Math.cos(n),i=n===t?Math.sin(n):(r-Math.cos(t))/(t-n),u=r/i+n;return xo(i)<Uo?ce:(e.invert=function(n,t){var e=u-t;return[Math.atan2(n,e)/i,u-K(i)*Math.sqrt(n*n+e*e)]},e)}function Ne(n,t){return[n,Math.log(Math.tan(Fo/4+t/2))]}function Ee(n){var t,e=oe(n),r=e.scale,i=e.translate,u=e.clipExtent;return e.scale=function(){var n=r.apply(e,arguments);return n===e?t?e.clipExtent(null):e:n},e.translate=function(){var n=i.apply(e,arguments);return n===e?t?e.clipExtent(null):e:n},e.clipExtent=function(n){var o=u.apply(e,arguments);if(o===e){if(t=null==n){var a=Fo*r(),l=i();u([[l[0]-a,l[1]-a],[l[0]+a,l[1]+a]])}}else t&&(o=null);return o},e.clipExtent(null)}function Ae(n,t){return[Math.log(Math.tan(Fo/4+t/2)),-n]}function Ce(n){return n[0]}function ze(n){return n[1]}function Le(n){for(var t=n.length,e=[0,1],r=2,i=2;t>i;i++){for(;r>1&&Q(n[e[r-2]],n[e[r-1]],n[i])<=0;)--r;e[r++]=i}return e.slice(0,r)}function qe(n,t){return n[0]-t[0]||n[1]-t[1]}function Te(n,t,e){return(e[0]-t[0])*(n[1]-t[1])<(e[1]-t[1])*(n[0]-t[0])}function Re(n,t,e,r){var i=n[0],u=e[0],o=t[0]-i,a=r[0]-u,l=n[1],c=e[1],f=t[1]-l,s=r[1]-c,h=(a*(l-c)-s*(i-u))/(s*o-a*f);return[i+h*o,l+h*f]}function De(n){var t=n[0],e=n[n.length-1];return!(t[0]-e[0]||t[1]-e[1])}function Pe(){rr(this),this.edge=this.site=this.circle=null}function Ue(n){var t=cl.pop()||new Pe;return t.site=n,t}function je(n){Be(n),ol.remove(n),cl.push(n),rr(n)}function Fe(n){var t=n.circle,e=t.x,r=t.cy,i={x:e,y:r},u=n.P,o=n.N,a=[n];je(n);for(var l=u;l.circle&&xo(e-l.circle.x)<Uo&&xo(r-l.circle.cy)<Uo;)u=l.P,a.unshift(l),je(l),l=u;a.unshift(l),Be(l);for(var c=o;c.circle&&xo(e-c.circle.x)<Uo&&xo(r-c.circle.cy)<Uo;)o=c.N,a.push(c),je(c),c=o;a.push(c),Be(c);var f,s=a.length;for(f=1;s>f;++f)c=a[f],l=a[f-1],nr(c.edge,l.site,c.site,i);l=a[0],c=a[s-1],c.edge=Ke(l.site,c.site,null,i),$e(l),$e(c)}function He(n){for(var t,e,r,i,u=n.x,o=n.y,a=ol._;a;)if(r=Oe(a,o)-u,r>Uo)a=a.L;else{if(i=u-Ie(a,o),!(i>Uo)){r>-Uo?(t=a.P,e=a):i>-Uo?(t=a,e=a.N):t=e=a;break}if(!a.R){t=a;break}a=a.R}var l=Ue(n);if(ol.insert(t,l),t||e){if(t===e)return Be(t),e=Ue(t.site),ol.insert(l,e),l.edge=e.edge=Ke(t.site,l.site),$e(t),void $e(e);if(!e)return void(l.edge=Ke(t.site,l.site));Be(t),Be(e);var c=t.site,f=c.x,s=c.y,h=n.x-f,p=n.y-s,g=e.site,v=g.x-f,d=g.y-s,y=2*(h*d-p*v),m=h*h+p*p,M=v*v+d*d,x={x:(d*m-p*M)/y+f,y:(h*M-v*m)/y+s};nr(e.edge,c,g,x),l.edge=Ke(c,n,null,x),e.edge=Ke(n,g,null,x),$e(t),$e(e)}}function Oe(n,t){var e=n.site,r=e.x,i=e.y,u=i-t;if(!u)return r;var o=n.P;if(!o)return-(1/0);e=o.site;var a=e.x,l=e.y,c=l-t;if(!c)return a;var f=a-r,s=1/u-1/c,h=f/c;return s?(-h+Math.sqrt(h*h-2*s*(f*f/(-2*c)-l+c/2+i-u/2)))/s+r:(r+a)/2}function Ie(n,t){var e=n.N;if(e)return Oe(e,t);var r=n.site;return r.y===t?r.x:1/0}function Ye(n){this.site=n,this.edges=[]}function Ze(n){for(var t,e,r,i,u,o,a,l,c,f,s=n[0][0],h=n[1][0],p=n[0][1],g=n[1][1],v=ul,d=v.length;d--;)if(u=v[d],u&&u.prepare())for(a=u.edges,l=a.length,o=0;l>o;)f=a[o].end(),r=f.x,i=f.y,c=a[++o%l].start(),t=c.x,e=c.y,(xo(r-t)>Uo||xo(i-e)>Uo)&&(a.splice(o,0,new tr(Qe(u.site,f,xo(r-s)<Uo&&g-i>Uo?{x:s,y:xo(t-s)<Uo?e:g}:xo(i-g)<Uo&&h-r>Uo?{x:xo(e-g)<Uo?t:h,y:g}:xo(r-h)<Uo&&i-p>Uo?{x:h,y:xo(t-h)<Uo?e:p}:xo(i-p)<Uo&&r-s>Uo?{x:xo(e-p)<Uo?t:s,y:p}:null),u.site,null)),++l)}function Ve(n,t){return t.angle-n.angle}function Xe(){rr(this),this.x=this.y=this.arc=this.site=this.cy=null}function $e(n){var t=n.P,e=n.N;if(t&&e){var r=t.site,i=n.site,u=e.site;if(r!==u){var o=i.x,a=i.y,l=r.x-o,c=r.y-a,f=u.x-o,s=u.y-a,h=2*(l*s-c*f);if(!(h>=-jo)){var p=l*l+c*c,g=f*f+s*s,v=(s*p-c*g)/h,d=(l*g-f*p)/h,s=d+a,y=fl.pop()||new Xe;y.arc=n,y.site=i,y.x=v+o,y.y=s+Math.sqrt(v*v+d*d),y.cy=s,n.circle=y;for(var m=null,M=ll._;M;)if(y.y<M.y||y.y===M.y&&y.x<=M.x){if(!M.L){m=M.P;break}M=M.L}else{if(!M.R){m=M;break}M=M.R}ll.insert(m,y),m||(al=y)}}}}function Be(n){var t=n.circle;t&&(t.P||(al=t.N),ll.remove(t),fl.push(t),rr(t),n.circle=null)}function We(n){for(var t,e=il,r=Yt(n[0][0],n[0][1],n[1][0],n[1][1]),i=e.length;i--;)t=e[i],(!Je(t,n)||!r(t)||xo(t.a.x-t.b.x)<Uo&&xo(t.a.y-t.b.y)<Uo)&&(t.a=t.b=null,e.splice(i,1))}function Je(n,t){var e=n.b;if(e)return!0;var r,i,u=n.a,o=t[0][0],a=t[1][0],l=t[0][1],c=t[1][1],f=n.l,s=n.r,h=f.x,p=f.y,g=s.x,v=s.y,d=(h+g)/2,y=(p+v)/2;if(v===p){if(o>d||d>=a)return;if(h>g){if(u){if(u.y>=c)return}else u={x:d,y:l};e={x:d,y:c}}else{if(u){if(u.y<l)return}else u={x:d,y:c};e={x:d,y:l}}}else if(r=(h-g)/(v-p),i=y-r*d,-1>r||r>1)if(h>g){if(u){if(u.y>=c)return}else u={x:(l-i)/r,y:l};e={x:(c-i)/r,y:c}}else{if(u){if(u.y<l)return}else u={x:(c-i)/r,y:c};e={x:(l-i)/r,y:l}}else if(v>p){if(u){if(u.x>=a)return}else u={x:o,y:r*o+i};e={x:a,y:r*a+i}}else{if(u){if(u.x<o)return}else u={x:a,y:r*a+i};e={x:o,y:r*o+i}}return n.a=u,n.b=e,!0}function Ge(n,t){this.l=n,this.r=t,this.a=this.b=null}function Ke(n,t,e,r){var i=new Ge(n,t);return il.push(i),e&&nr(i,n,t,e),r&&nr(i,t,n,r),ul[n.i].edges.push(new tr(i,n,t)),ul[t.i].edges.push(new tr(i,t,n)),i}function Qe(n,t,e){var r=new Ge(n,null);return r.a=t,r.b=e,il.push(r),r}function nr(n,t,e,r){n.a||n.b?n.l===e?n.b=r:n.a=r:(n.a=r,n.l=t,n.r=e)}function tr(n,t,e){var r=n.a,i=n.b;this.edge=n,this.site=t,this.angle=e?Math.atan2(e.y-t.y,e.x-t.x):n.l===t?Math.atan2(i.x-r.x,r.y-i.y):Math.atan2(r.x-i.x,i.y-r.y)}function er(){this._=null}function rr(n){n.U=n.C=n.L=n.R=n.P=n.N=null}function ir(n,t){var e=t,r=t.R,i=e.U;i?i.L===e?i.L=r:i.R=r:n._=r,r.U=i,e.U=r,e.R=r.L,e.R&&(e.R.U=e),r.L=e}function ur(n,t){var e=t,r=t.L,i=e.U;i?i.L===e?i.L=r:i.R=r:n._=r,r.U=i,e.U=r,e.L=r.R,e.L&&(e.L.U=e),r.R=e}function or(n){for(;n.L;)n=n.L;return n}function ar(n,t){var e,r,i,u=n.sort(lr).pop();for(il=[],ul=new Array(n.length),ol=new er,ll=new er;;)if(i=al,u&&(!i||u.y<i.y||u.y===i.y&&u.x<i.x))u.x===e&&u.y===r||(ul[u.i]=new Ye(u),He(u),e=u.x,r=u.y),u=n.pop();else{if(!i)break;Fe(i.arc)}t&&(We(t),Ze(t));var o={cells:ul,edges:il};return ol=ll=il=ul=null,o}function lr(n,t){return t.y-n.y||t.x-n.x}function cr(n,t,e){return(n.x-e.x)*(t.y-n.y)-(n.x-t.x)*(e.y-n.y)}function fr(n){return n.x}function sr(n){return n.y}function hr(){return{leaf:!0,nodes:[],point:null,x:null,y:null}}function pr(n,t,e,r,i,u){if(!n(t,e,r,i,u)){var o=.5*(e+i),a=.5*(r+u),l=t.nodes;l[0]&&pr(n,l[0],e,r,o,a),l[1]&&pr(n,l[1],o,r,i,a),l[2]&&pr(n,l[2],e,a,o,u),l[3]&&pr(n,l[3],o,a,i,u)}}function gr(n,t,e,r,i,u,o){var a,l=1/0;return function c(n,f,s,h,p){if(!(f>u||s>o||r>h||i>p)){if(g=n.point){var g,v=t-n.x,d=e-n.y,y=v*v+d*d;if(l>y){var m=Math.sqrt(l=y);r=t-m,i=e-m,u=t+m,o=e+m,a=g}}for(var M=n.nodes,x=.5*(f+h),b=.5*(s+p),_=t>=x,w=e>=b,S=w<<1|_,k=S+4;k>S;++S)if(n=M[3&S])switch(3&S){case 0:c(n,f,s,x,b);break;case 1:c(n,x,s,h,b);break;case 2:c(n,f,b,x,p);break;case 3:c(n,x,b,h,p)}}}(n,r,i,u,o),a}function vr(n,t){n=ao.rgb(n),t=ao.rgb(t);var e=n.r,r=n.g,i=n.b,u=t.r-e,o=t.g-r,a=t.b-i;return function(n){return"#"+bn(Math.round(e+u*n))+bn(Math.round(r+o*n))+bn(Math.round(i+a*n))}}function dr(n,t){var e,r={},i={};for(e in n)e in t?r[e]=Mr(n[e],t[e]):i[e]=n[e];for(e in t)e in n||(i[e]=t[e]);return function(n){for(e in r)i[e]=r[e](n);return i}}function yr(n,t){return n=+n,t=+t,function(e){return n*(1-e)+t*e}}function mr(n,t){var e,r,i,u=hl.lastIndex=pl.lastIndex=0,o=-1,a=[],l=[];for(n+="",t+="";(e=hl.exec(n))&&(r=pl.exec(t));)(i=r.index)>u&&(i=t.slice(u,i),a[o]?a[o]+=i:a[++o]=i),(e=e[0])===(r=r[0])?a[o]?a[o]+=r:a[++o]=r:(a[++o]=null,l.push({i:o,x:yr(e,r)})),u=pl.lastIndex;return u<t.length&&(i=t.slice(u),a[o]?a[o]+=i:a[++o]=i),a.length<2?l[0]?(t=l[0].x,function(n){return t(n)+""}):function(){return t}:(t=l.length,function(n){for(var e,r=0;t>r;++r)a[(e=l[r]).i]=e.x(n);return a.join("")})}function Mr(n,t){for(var e,r=ao.interpolators.length;--r>=0&&!(e=ao.interpolators[r](n,t)););return e}function xr(n,t){var e,r=[],i=[],u=n.length,o=t.length,a=Math.min(n.length,t.length);for(e=0;a>e;++e)r.push(Mr(n[e],t[e]));for(;u>e;++e)i[e]=n[e];for(;o>e;++e)i[e]=t[e];return function(n){for(e=0;a>e;++e)i[e]=r[e](n);return i}}function br(n){return function(t){return 0>=t?0:t>=1?1:n(t)}}function _r(n){return function(t){return 1-n(1-t)}}function wr(n){return function(t){return.5*(.5>t?n(2*t):2-n(2-2*t))}}function Sr(n){return n*n}function kr(n){return n*n*n}function Nr(n){if(0>=n)return 0;if(n>=1)return 1;var t=n*n,e=t*n;return 4*(.5>n?e:3*(n-t)+e-.75)}function Er(n){return function(t){return Math.pow(t,n)}}function Ar(n){return 1-Math.cos(n*Io)}function Cr(n){return Math.pow(2,10*(n-1))}function zr(n){return 1-Math.sqrt(1-n*n)}function Lr(n,t){var e;return arguments.length<2&&(t=.45),arguments.length?e=t/Ho*Math.asin(1/n):(n=1,e=t/4),function(r){return 1+n*Math.pow(2,-10*r)*Math.sin((r-e)*Ho/t)}}function qr(n){return n||(n=1.70158),function(t){return t*t*((n+1)*t-n)}}function Tr(n){return 1/2.75>n?7.5625*n*n:2/2.75>n?7.5625*(n-=1.5/2.75)*n+.75:2.5/2.75>n?7.5625*(n-=2.25/2.75)*n+.9375:7.5625*(n-=2.625/2.75)*n+.984375}function Rr(n,t){n=ao.hcl(n),t=ao.hcl(t);var e=n.h,r=n.c,i=n.l,u=t.h-e,o=t.c-r,a=t.l-i;return isNaN(o)&&(o=0,r=isNaN(r)?t.c:r),isNaN(u)?(u=0,e=isNaN(e)?t.h:e):u>180?u-=360:-180>u&&(u+=360),function(n){return sn(e+u*n,r+o*n,i+a*n)+""}}function Dr(n,t){n=ao.hsl(n),t=ao.hsl(t);var e=n.h,r=n.s,i=n.l,u=t.h-e,o=t.s-r,a=t.l-i;return isNaN(o)&&(o=0,r=isNaN(r)?t.s:r),isNaN(u)?(u=0,e=isNaN(e)?t.h:e):u>180?u-=360:-180>u&&(u+=360),function(n){return cn(e+u*n,r+o*n,i+a*n)+""}}function Pr(n,t){n=ao.lab(n),t=ao.lab(t);var e=n.l,r=n.a,i=n.b,u=t.l-e,o=t.a-r,a=t.b-i;return function(n){return pn(e+u*n,r+o*n,i+a*n)+""}}function Ur(n,t){return t-=n,function(e){return Math.round(n+t*e)}}function jr(n){var t=[n.a,n.b],e=[n.c,n.d],r=Hr(t),i=Fr(t,e),u=Hr(Or(e,t,-i))||0;t[0]*e[1]<e[0]*t[1]&&(t[0]*=-1,t[1]*=-1,r*=-1,i*=-1),this.rotate=(r?Math.atan2(t[1],t[0]):Math.atan2(-e[0],e[1]))*Zo,this.translate=[n.e,n.f],this.scale=[r,u],this.skew=u?Math.atan2(i,u)*Zo:0}function Fr(n,t){return n[0]*t[0]+n[1]*t[1]}function Hr(n){var t=Math.sqrt(Fr(n,n));return t&&(n[0]/=t,n[1]/=t),t}function Or(n,t,e){return n[0]+=e*t[0],n[1]+=e*t[1],n}function Ir(n){return n.length?n.pop()+",":""}function Yr(n,t,e,r){if(n[0]!==t[0]||n[1]!==t[1]){var i=e.push("translate(",null,",",null,")");r.push({i:i-4,x:yr(n[0],t[0])},{i:i-2,x:yr(n[1],t[1])})}else(t[0]||t[1])&&e.push("translate("+t+")")}function Zr(n,t,e,r){n!==t?(n-t>180?t+=360:t-n>180&&(n+=360),r.push({i:e.push(Ir(e)+"rotate(",null,")")-2,x:yr(n,t)})):t&&e.push(Ir(e)+"rotate("+t+")")}function Vr(n,t,e,r){n!==t?r.push({i:e.push(Ir(e)+"skewX(",null,")")-2,x:yr(n,t)}):t&&e.push(Ir(e)+"skewX("+t+")")}function Xr(n,t,e,r){if(n[0]!==t[0]||n[1]!==t[1]){var i=e.push(Ir(e)+"scale(",null,",",null,")");r.push({i:i-4,x:yr(n[0],t[0])},{i:i-2,x:yr(n[1],t[1])})}else 1===t[0]&&1===t[1]||e.push(Ir(e)+"scale("+t+")")}function $r(n,t){var e=[],r=[];return n=ao.transform(n),t=ao.transform(t),Yr(n.translate,t.translate,e,r),Zr(n.rotate,t.rotate,e,r),Vr(n.skew,t.skew,e,r),Xr(n.scale,t.scale,e,r),n=t=null,function(n){for(var t,i=-1,u=r.length;++i<u;)e[(t=r[i]).i]=t.x(n);return e.join("")}}function Br(n,t){return t=(t-=n=+n)||1/t,function(e){return(e-n)/t}}function Wr(n,t){return t=(t-=n=+n)||1/t,function(e){return Math.max(0,Math.min(1,(e-n)/t))}}function Jr(n){for(var t=n.source,e=n.target,r=Kr(t,e),i=[t];t!==r;)t=t.parent,i.push(t);for(var u=i.length;e!==r;)i.splice(u,0,e),e=e.parent;return i}function Gr(n){for(var t=[],e=n.parent;null!=e;)t.push(n),n=e,e=e.parent;return t.push(n),t}function Kr(n,t){if(n===t)return n;for(var e=Gr(n),r=Gr(t),i=e.pop(),u=r.pop(),o=null;i===u;)o=i,i=e.pop(),u=r.pop();return o}function Qr(n){n.fixed|=2}function ni(n){n.fixed&=-7}function ti(n){n.fixed|=4,n.px=n.x,n.py=n.y}function ei(n){n.fixed&=-5}function ri(n,t,e){var r=0,i=0;if(n.charge=0,!n.leaf)for(var u,o=n.nodes,a=o.length,l=-1;++l<a;)u=o[l],null!=u&&(ri(u,t,e),n.charge+=u.charge,r+=u.charge*u.cx,i+=u.charge*u.cy);if(n.point){n.leaf||(n.point.x+=Math.random()-.5,n.point.y+=Math.random()-.5);var c=t*e[n.point.index];n.charge+=n.pointCharge=c,r+=c*n.point.x,i+=c*n.point.y}n.cx=r/n.charge,n.cy=i/n.charge}function ii(n,t){return ao.rebind(n,t,"sort","children","value"),n.nodes=n,n.links=fi,n}function ui(n,t){for(var e=[n];null!=(n=e.pop());)if(t(n),(i=n.children)&&(r=i.length))for(var r,i;--r>=0;)e.push(i[r])}function oi(n,t){for(var e=[n],r=[];null!=(n=e.pop());)if(r.push(n),(u=n.children)&&(i=u.length))for(var i,u,o=-1;++o<i;)e.push(u[o]);for(;null!=(n=r.pop());)t(n)}function ai(n){return n.children}function li(n){return n.value}function ci(n,t){return t.value-n.value}function fi(n){return ao.merge(n.map(function(n){return(n.children||[]).map(function(t){return{source:n,target:t}})}))}function si(n){return n.x}function hi(n){return n.y}function pi(n,t,e){n.y0=t,n.y=e}function gi(n){return ao.range(n.length)}function vi(n){for(var t=-1,e=n[0].length,r=[];++t<e;)r[t]=0;return r}function di(n){for(var t,e=1,r=0,i=n[0][1],u=n.length;u>e;++e)(t=n[e][1])>i&&(r=e,i=t);return r}function yi(n){return n.reduce(mi,0)}function mi(n,t){return n+t[1]}function Mi(n,t){return xi(n,Math.ceil(Math.log(t.length)/Math.LN2+1))}function xi(n,t){for(var e=-1,r=+n[0],i=(n[1]-r)/t,u=[];++e<=t;)u[e]=i*e+r;return u}function bi(n){return[ao.min(n),ao.max(n)]}function _i(n,t){return n.value-t.value}function wi(n,t){var e=n._pack_next;n._pack_next=t,t._pack_prev=n,t._pack_next=e,e._pack_prev=t}function Si(n,t){n._pack_next=t,t._pack_prev=n}function ki(n,t){var e=t.x-n.x,r=t.y-n.y,i=n.r+t.r;return.999*i*i>e*e+r*r}function Ni(n){function t(n){f=Math.min(n.x-n.r,f),s=Math.max(n.x+n.r,s),h=Math.min(n.y-n.r,h),p=Math.max(n.y+n.r,p)}if((e=n.children)&&(c=e.length)){var e,r,i,u,o,a,l,c,f=1/0,s=-(1/0),h=1/0,p=-(1/0);if(e.forEach(Ei),r=e[0],r.x=-r.r,r.y=0,t(r),c>1&&(i=e[1],i.x=i.r,i.y=0,t(i),c>2))for(u=e[2],zi(r,i,u),t(u),wi(r,u),r._pack_prev=u,wi(u,i),i=r._pack_next,o=3;c>o;o++){zi(r,i,u=e[o]);var g=0,v=1,d=1;for(a=i._pack_next;a!==i;a=a._pack_next,v++)if(ki(a,u)){g=1;break}if(1==g)for(l=r._pack_prev;l!==a._pack_prev&&!ki(l,u);l=l._pack_prev,d++);g?(d>v||v==d&&i.r<r.r?Si(r,i=a):Si(r=l,i),o--):(wi(r,u),i=u,t(u))}var y=(f+s)/2,m=(h+p)/2,M=0;for(o=0;c>o;o++)u=e[o],u.x-=y,u.y-=m,M=Math.max(M,u.r+Math.sqrt(u.x*u.x+u.y*u.y));n.r=M,e.forEach(Ai)}}function Ei(n){n._pack_next=n._pack_prev=n}function Ai(n){delete n._pack_next,delete n._pack_prev}function Ci(n,t,e,r){var i=n.children;if(n.x=t+=r*n.x,n.y=e+=r*n.y,n.r*=r,i)for(var u=-1,o=i.length;++u<o;)Ci(i[u],t,e,r)}function zi(n,t,e){var r=n.r+e.r,i=t.x-n.x,u=t.y-n.y;if(r&&(i||u)){var o=t.r+e.r,a=i*i+u*u;o*=o,r*=r;var l=.5+(r-o)/(2*a),c=Math.sqrt(Math.max(0,2*o*(r+a)-(r-=a)*r-o*o))/(2*a);e.x=n.x+l*i+c*u,e.y=n.y+l*u-c*i}else e.x=n.x+r,e.y=n.y}function Li(n,t){return n.parent==t.parent?1:2}function qi(n){var t=n.children;return t.length?t[0]:n.t}function Ti(n){var t,e=n.children;return(t=e.length)?e[t-1]:n.t}function Ri(n,t,e){var r=e/(t.i-n.i);t.c-=r,t.s+=e,n.c+=r,t.z+=e,t.m+=e}function Di(n){for(var t,e=0,r=0,i=n.children,u=i.length;--u>=0;)t=i[u],t.z+=e,t.m+=e,e+=t.s+(r+=t.c)}function Pi(n,t,e){return n.a.parent===t.parent?n.a:e}function Ui(n){return 1+ao.max(n,function(n){return n.y})}function ji(n){return n.reduce(function(n,t){return n+t.x},0)/n.length}function Fi(n){var t=n.children;return t&&t.length?Fi(t[0]):n}function Hi(n){var t,e=n.children;return e&&(t=e.length)?Hi(e[t-1]):n}function Oi(n){return{x:n.x,y:n.y,dx:n.dx,dy:n.dy}}function Ii(n,t){var e=n.x+t[3],r=n.y+t[0],i=n.dx-t[1]-t[3],u=n.dy-t[0]-t[2];return 0>i&&(e+=i/2,i=0),0>u&&(r+=u/2,u=0),{x:e,y:r,dx:i,dy:u}}function Yi(n){var t=n[0],e=n[n.length-1];return e>t?[t,e]:[e,t]}function Zi(n){return n.rangeExtent?n.rangeExtent():Yi(n.range())}function Vi(n,t,e,r){var i=e(n[0],n[1]),u=r(t[0],t[1]);return function(n){return u(i(n))}}function Xi(n,t){var e,r=0,i=n.length-1,u=n[r],o=n[i];return u>o&&(e=r,r=i,i=e,e=u,u=o,o=e),n[r]=t.floor(u),n[i]=t.ceil(o),n}function $i(n){return n?{floor:function(t){return Math.floor(t/n)*n},ceil:function(t){return Math.ceil(t/n)*n}}:Sl}function Bi(n,t,e,r){var i=[],u=[],o=0,a=Math.min(n.length,t.length)-1;for(n[a]<n[0]&&(n=n.slice().reverse(),t=t.slice().reverse());++o<=a;)i.push(e(n[o-1],n[o])),u.push(r(t[o-1],t[o]));return function(t){var e=ao.bisect(n,t,1,a)-1;return u[e](i[e](t))}}function Wi(n,t,e,r){function i(){var i=Math.min(n.length,t.length)>2?Bi:Vi,l=r?Wr:Br;return o=i(n,t,l,e),a=i(t,n,l,Mr),u}function u(n){return o(n)}var o,a;return u.invert=function(n){return a(n)},u.domain=function(t){return arguments.length?(n=t.map(Number),i()):n},u.range=function(n){return arguments.length?(t=n,i()):t},u.rangeRound=function(n){return u.range(n).interpolate(Ur)},u.clamp=function(n){return arguments.length?(r=n,i()):r},u.interpolate=function(n){return arguments.length?(e=n,i()):e},u.ticks=function(t){return Qi(n,t)},u.tickFormat=function(t,e){return nu(n,t,e)},u.nice=function(t){return Gi(n,t),i()},u.copy=function(){return Wi(n,t,e,r)},i()}function Ji(n,t){return ao.rebind(n,t,"range","rangeRound","interpolate","clamp")}function Gi(n,t){return Xi(n,$i(Ki(n,t)[2])),Xi(n,$i(Ki(n,t)[2])),n}function Ki(n,t){null==t&&(t=10);var e=Yi(n),r=e[1]-e[0],i=Math.pow(10,Math.floor(Math.log(r/t)/Math.LN10)),u=t/r*i;return.15>=u?i*=10:.35>=u?i*=5:.75>=u&&(i*=2),e[0]=Math.ceil(e[0]/i)*i,e[1]=Math.floor(e[1]/i)*i+.5*i,e[2]=i,e}function Qi(n,t){return ao.range.apply(ao,Ki(n,t))}function nu(n,t,e){var r=Ki(n,t);if(e){var i=ha.exec(e);if(i.shift(),"s"===i[8]){var u=ao.formatPrefix(Math.max(xo(r[0]),xo(r[1])));return i[7]||(i[7]="."+tu(u.scale(r[2]))),i[8]="f",e=ao.format(i.join("")),function(n){return e(u.scale(n))+u.symbol}}i[7]||(i[7]="."+eu(i[8],r)),e=i.join("")}else e=",."+tu(r[2])+"f";return ao.format(e)}function tu(n){return-Math.floor(Math.log(n)/Math.LN10+.01)}function eu(n,t){var e=tu(t[2]);return n in kl?Math.abs(e-tu(Math.max(xo(t[0]),xo(t[1]))))+ +("e"!==n):e-2*("%"===n)}function ru(n,t,e,r){function i(n){return(e?Math.log(0>n?0:n):-Math.log(n>0?0:-n))/Math.log(t)}function u(n){return e?Math.pow(t,n):-Math.pow(t,-n)}function o(t){return n(i(t))}return o.invert=function(t){return u(n.invert(t))},o.domain=function(t){return arguments.length?(e=t[0]>=0,n.domain((r=t.map(Number)).map(i)),o):r},o.base=function(e){return arguments.length?(t=+e,n.domain(r.map(i)),o):t},o.nice=function(){var t=Xi(r.map(i),e?Math:El);return n.domain(t),r=t.map(u),o},o.ticks=function(){var n=Yi(r),o=[],a=n[0],l=n[1],c=Math.floor(i(a)),f=Math.ceil(i(l)),s=t%1?2:t;if(isFinite(f-c)){if(e){for(;f>c;c++)for(var h=1;s>h;h++)o.push(u(c)*h);o.push(u(c))}else for(o.push(u(c));c++<f;)for(var h=s-1;h>0;h--)o.push(u(c)*h);for(c=0;o[c]<a;c++);for(f=o.length;o[f-1]>l;f--);o=o.slice(c,f)}return o},o.tickFormat=function(n,e){if(!arguments.length)return Nl;arguments.length<2?e=Nl:"function"!=typeof e&&(e=ao.format(e));var r=Math.max(1,t*n/o.ticks().length);return function(n){var o=n/u(Math.round(i(n)));return t-.5>o*t&&(o*=t),r>=o?e(n):""}},o.copy=function(){return ru(n.copy(),t,e,r)},Ji(o,n)}function iu(n,t,e){function r(t){return n(i(t))}var i=uu(t),u=uu(1/t);return r.invert=function(t){return u(n.invert(t))},r.domain=function(t){return arguments.length?(n.domain((e=t.map(Number)).map(i)),r):e},r.ticks=function(n){return Qi(e,n)},r.tickFormat=function(n,t){return nu(e,n,t)},r.nice=function(n){return r.domain(Gi(e,n))},r.exponent=function(o){return arguments.length?(i=uu(t=o),u=uu(1/t),n.domain(e.map(i)),r):t},r.copy=function(){return iu(n.copy(),t,e)},Ji(r,n)}function uu(n){return function(t){return 0>t?-Math.pow(-t,n):Math.pow(t,n)}}function ou(n,t){function e(e){return u[((i.get(e)||("range"===t.t?i.set(e,n.push(e)):NaN))-1)%u.length]}function r(t,e){return ao.range(n.length).map(function(n){return t+e*n})}var i,u,o;return e.domain=function(r){if(!arguments.length)return n;n=[],i=new c;for(var u,o=-1,a=r.length;++o<a;)i.has(u=r[o])||i.set(u,n.push(u));return e[t.t].apply(e,t.a)},e.range=function(n){return arguments.length?(u=n,o=0,t={t:"range",a:arguments},e):u},e.rangePoints=function(i,a){arguments.length<2&&(a=0);var l=i[0],c=i[1],f=n.length<2?(l=(l+c)/2,0):(c-l)/(n.length-1+a);return u=r(l+f*a/2,f),o=0,t={t:"rangePoints",a:arguments},e},e.rangeRoundPoints=function(i,a){arguments.length<2&&(a=0);var l=i[0],c=i[1],f=n.length<2?(l=c=Math.round((l+c)/2),0):(c-l)/(n.length-1+a)|0;return u=r(l+Math.round(f*a/2+(c-l-(n.length-1+a)*f)/2),f),o=0,t={t:"rangeRoundPoints",a:arguments},e},e.rangeBands=function(i,a,l){arguments.length<2&&(a=0),arguments.length<3&&(l=a);var c=i[1]<i[0],f=i[c-0],s=i[1-c],h=(s-f)/(n.length-a+2*l);return u=r(f+h*l,h),c&&u.reverse(),o=h*(1-a),t={t:"rangeBands",a:arguments},e},e.rangeRoundBands=function(i,a,l){arguments.length<2&&(a=0),arguments.length<3&&(l=a);var c=i[1]<i[0],f=i[c-0],s=i[1-c],h=Math.floor((s-f)/(n.length-a+2*l));return u=r(f+Math.round((s-f-(n.length-a)*h)/2),h),c&&u.reverse(),o=Math.round(h*(1-a)),t={t:"rangeRoundBands",a:arguments},e},e.rangeBand=function(){return o},e.rangeExtent=function(){return Yi(t.a[0])},e.copy=function(){return ou(n,t)},e.domain(n)}function au(n,t){function u(){var e=0,r=t.length;for(a=[];++e<r;)a[e-1]=ao.quantile(n,e/r);return o}function o(n){return isNaN(n=+n)?void 0:t[ao.bisect(a,n)]}var a;return o.domain=function(t){return arguments.length?(n=t.map(r).filter(i).sort(e),u()):n},o.range=function(n){return arguments.length?(t=n,u()):t},o.quantiles=function(){return a},o.invertExtent=function(e){return e=t.indexOf(e),0>e?[NaN,NaN]:[e>0?a[e-1]:n[0],e<a.length?a[e]:n[n.length-1]]},o.copy=function(){return au(n,t)},u()}function lu(n,t,e){function r(t){return e[Math.max(0,Math.min(o,Math.floor(u*(t-n))))]}function i(){return u=e.length/(t-n),o=e.length-1,r}var u,o;return r.domain=function(e){return arguments.length?(n=+e[0],t=+e[e.length-1],i()):[n,t]},r.range=function(n){return arguments.length?(e=n,i()):e},r.invertExtent=function(t){return t=e.indexOf(t),t=0>t?NaN:t/u+n,[t,t+1/u]},r.copy=function(){return lu(n,t,e)},i()}function cu(n,t){function e(e){return e>=e?t[ao.bisect(n,e)]:void 0}return e.domain=function(t){return arguments.length?(n=t,e):n},e.range=function(n){return arguments.length?(t=n,e):t},e.invertExtent=function(e){return e=t.indexOf(e),[n[e-1],n[e]]},e.copy=function(){return cu(n,t)},e}function fu(n){function t(n){return+n}return t.invert=t,t.domain=t.range=function(e){return arguments.length?(n=e.map(t),t):n},t.ticks=function(t){return Qi(n,t)},t.tickFormat=function(t,e){return nu(n,t,e)},t.copy=function(){return fu(n)},t}function su(){return 0}function hu(n){return n.innerRadius}function pu(n){return n.outerRadius}function gu(n){return n.startAngle}function vu(n){return n.endAngle}function du(n){return n&&n.padAngle}function yu(n,t,e,r){return(n-e)*t-(t-r)*n>0?0:1}function mu(n,t,e,r,i){var u=n[0]-t[0],o=n[1]-t[1],a=(i?r:-r)/Math.sqrt(u*u+o*o),l=a*o,c=-a*u,f=n[0]+l,s=n[1]+c,h=t[0]+l,p=t[1]+c,g=(f+h)/2,v=(s+p)/2,d=h-f,y=p-s,m=d*d+y*y,M=e-r,x=f*p-h*s,b=(0>y?-1:1)*Math.sqrt(Math.max(0,M*M*m-x*x)),_=(x*y-d*b)/m,w=(-x*d-y*b)/m,S=(x*y+d*b)/m,k=(-x*d+y*b)/m,N=_-g,E=w-v,A=S-g,C=k-v;return N*N+E*E>A*A+C*C&&(_=S,w=k),[[_-l,w-c],[_*e/M,w*e/M]]}function Mu(n){function t(t){function o(){c.push("M",u(n(f),a))}for(var l,c=[],f=[],s=-1,h=t.length,p=En(e),g=En(r);++s<h;)i.call(this,l=t[s],s)?f.push([+p.call(this,l,s),+g.call(this,l,s)]):f.length&&(o(),f=[]);return f.length&&o(),c.length?c.join(""):null}var e=Ce,r=ze,i=zt,u=xu,o=u.key,a=.7;return t.x=function(n){return arguments.length?(e=n,t):e},t.y=function(n){return arguments.length?(r=n,t):r},t.defined=function(n){return arguments.length?(i=n,t):i},t.interpolate=function(n){return arguments.length?(o="function"==typeof n?u=n:(u=Tl.get(n)||xu).key,t):o},t.tension=function(n){return arguments.length?(a=n,t):a},t}function xu(n){return n.length>1?n.join("L"):n+"Z"}function bu(n){return n.join("L")+"Z"}function _u(n){for(var t=0,e=n.length,r=n[0],i=[r[0],",",r[1]];++t<e;)i.push("H",(r[0]+(r=n[t])[0])/2,"V",r[1]);return e>1&&i.push("H",r[0]),i.join("")}function wu(n){for(var t=0,e=n.length,r=n[0],i=[r[0],",",r[1]];++t<e;)i.push("V",(r=n[t])[1],"H",r[0]);return i.join("")}function Su(n){for(var t=0,e=n.length,r=n[0],i=[r[0],",",r[1]];++t<e;)i.push("H",(r=n[t])[0],"V",r[1]);return i.join("")}function ku(n,t){return n.length<4?xu(n):n[1]+Au(n.slice(1,-1),Cu(n,t))}function Nu(n,t){return n.length<3?bu(n):n[0]+Au((n.push(n[0]),n),Cu([n[n.length-2]].concat(n,[n[1]]),t))}function Eu(n,t){return n.length<3?xu(n):n[0]+Au(n,Cu(n,t))}function Au(n,t){if(t.length<1||n.length!=t.length&&n.length!=t.length+2)return xu(n);var e=n.length!=t.length,r="",i=n[0],u=n[1],o=t[0],a=o,l=1;if(e&&(r+="Q"+(u[0]-2*o[0]/3)+","+(u[1]-2*o[1]/3)+","+u[0]+","+u[1],i=n[1],l=2),t.length>1){a=t[1],u=n[l],l++,r+="C"+(i[0]+o[0])+","+(i[1]+o[1])+","+(u[0]-a[0])+","+(u[1]-a[1])+","+u[0]+","+u[1];for(var c=2;c<t.length;c++,l++)u=n[l],a=t[c],r+="S"+(u[0]-a[0])+","+(u[1]-a[1])+","+u[0]+","+u[1]}if(e){var f=n[l];r+="Q"+(u[0]+2*a[0]/3)+","+(u[1]+2*a[1]/3)+","+f[0]+","+f[1]}return r}function Cu(n,t){for(var e,r=[],i=(1-t)/2,u=n[0],o=n[1],a=1,l=n.length;++a<l;)e=u,u=o,o=n[a],r.push([i*(o[0]-e[0]),i*(o[1]-e[1])]);return r}function zu(n){if(n.length<3)return xu(n);var t=1,e=n.length,r=n[0],i=r[0],u=r[1],o=[i,i,i,(r=n[1])[0]],a=[u,u,u,r[1]],l=[i,",",u,"L",Ru(Pl,o),",",Ru(Pl,a)];for(n.push(n[e-1]);++t<=e;)r=n[t],o.shift(),o.push(r[0]),a.shift(),a.push(r[1]),Du(l,o,a);return n.pop(),l.push("L",r),l.join("")}function Lu(n){if(n.length<4)return xu(n);for(var t,e=[],r=-1,i=n.length,u=[0],o=[0];++r<3;)t=n[r],u.push(t[0]),o.push(t[1]);for(e.push(Ru(Pl,u)+","+Ru(Pl,o)),--r;++r<i;)t=n[r],u.shift(),u.push(t[0]),o.shift(),o.push(t[1]),Du(e,u,o);return e.join("")}function qu(n){for(var t,e,r=-1,i=n.length,u=i+4,o=[],a=[];++r<4;)e=n[r%i],o.push(e[0]),a.push(e[1]);for(t=[Ru(Pl,o),",",Ru(Pl,a)],--r;++r<u;)e=n[r%i],o.shift(),o.push(e[0]),a.shift(),a.push(e[1]),Du(t,o,a);return t.join("")}function Tu(n,t){var e=n.length-1;if(e)for(var r,i,u=n[0][0],o=n[0][1],a=n[e][0]-u,l=n[e][1]-o,c=-1;++c<=e;)r=n[c],i=c/e,r[0]=t*r[0]+(1-t)*(u+i*a),r[1]=t*r[1]+(1-t)*(o+i*l);return zu(n)}function Ru(n,t){return n[0]*t[0]+n[1]*t[1]+n[2]*t[2]+n[3]*t[3]}function Du(n,t,e){n.push("C",Ru(Rl,t),",",Ru(Rl,e),",",Ru(Dl,t),",",Ru(Dl,e),",",Ru(Pl,t),",",Ru(Pl,e))}function Pu(n,t){return(t[1]-n[1])/(t[0]-n[0])}function Uu(n){for(var t=0,e=n.length-1,r=[],i=n[0],u=n[1],o=r[0]=Pu(i,u);++t<e;)r[t]=(o+(o=Pu(i=u,u=n[t+1])))/2;return r[t]=o,r}function ju(n){for(var t,e,r,i,u=[],o=Uu(n),a=-1,l=n.length-1;++a<l;)t=Pu(n[a],n[a+1]),xo(t)<Uo?o[a]=o[a+1]=0:(e=o[a]/t,r=o[a+1]/t,i=e*e+r*r,i>9&&(i=3*t/Math.sqrt(i),o[a]=i*e,o[a+1]=i*r));for(a=-1;++a<=l;)i=(n[Math.min(l,a+1)][0]-n[Math.max(0,a-1)][0])/(6*(1+o[a]*o[a])),u.push([i||0,o[a]*i||0]);return u}function Fu(n){return n.length<3?xu(n):n[0]+Au(n,ju(n))}function Hu(n){for(var t,e,r,i=-1,u=n.length;++i<u;)t=n[i],e=t[0],r=t[1]-Io,t[0]=e*Math.cos(r),t[1]=e*Math.sin(r);return n}function Ou(n){function t(t){function l(){v.push("M",a(n(y),s),f,c(n(d.reverse()),s),"Z")}for(var h,p,g,v=[],d=[],y=[],m=-1,M=t.length,x=En(e),b=En(i),_=e===r?function(){return p}:En(r),w=i===u?function(){return g}:En(u);++m<M;)o.call(this,h=t[m],m)?(d.push([p=+x.call(this,h,m),g=+b.call(this,h,m)]),y.push([+_.call(this,h,m),+w.call(this,h,m)])):d.length&&(l(),d=[],y=[]);return d.length&&l(),v.length?v.join(""):null}var e=Ce,r=Ce,i=0,u=ze,o=zt,a=xu,l=a.key,c=a,f="L",s=.7;return t.x=function(n){return arguments.length?(e=r=n,t):r},t.x0=function(n){return arguments.length?(e=n,t):e},t.x1=function(n){return arguments.length?(r=n,t):r},t.y=function(n){return arguments.length?(i=u=n,t):u},t.y0=function(n){return arguments.length?(i=n,t):i},t.y1=function(n){return arguments.length?(u=n,t):u},t.defined=function(n){return arguments.length?(o=n,t):o},t.interpolate=function(n){return arguments.length?(l="function"==typeof n?a=n:(a=Tl.get(n)||xu).key,c=a.reverse||a,f=a.closed?"M":"L",t):l},t.tension=function(n){return arguments.length?(s=n,t):s},t}function Iu(n){return n.radius}function Yu(n){return[n.x,n.y]}function Zu(n){return function(){var t=n.apply(this,arguments),e=t[0],r=t[1]-Io;return[e*Math.cos(r),e*Math.sin(r)]}}function Vu(){return 64}function Xu(){return"circle"}function $u(n){var t=Math.sqrt(n/Fo);return"M0,"+t+"A"+t+","+t+" 0 1,1 0,"+-t+"A"+t+","+t+" 0 1,1 0,"+t+"Z"}function Bu(n){return function(){var t,e,r;(t=this[n])&&(r=t[e=t.active])&&(r.timer.c=null,r.timer.t=NaN,--t.count?delete t[e]:delete this[n],t.active+=.5,r.event&&r.event.interrupt.call(this,this.__data__,r.index))}}function Wu(n,t,e){return ko(n,Yl),n.namespace=t,n.id=e,n}function Ju(n,t,e,r){var i=n.id,u=n.namespace;return Y(n,"function"==typeof e?function(n,o,a){n[u][i].tween.set(t,r(e.call(n,n.__data__,o,a)))}:(e=r(e),function(n){n[u][i].tween.set(t,e)}))}function Gu(n){return null==n&&(n=""),function(){this.textContent=n}}function Ku(n){return null==n?"__transition__":"__transition_"+n+"__"}function Qu(n,t,e,r,i){function u(n){var t=v.delay;return f.t=t+l,n>=t?o(n-t):void(f.c=o)}function o(e){var i=g.active,u=g[i];u&&(u.timer.c=null,u.timer.t=NaN,--g.count,delete g[i],u.event&&u.event.interrupt.call(n,n.__data__,u.index));for(var o in g)if(r>+o){var c=g[o];c.timer.c=null,c.timer.t=NaN,--g.count,delete g[o]}f.c=a,qn(function(){return f.c&&a(e||1)&&(f.c=null,f.t=NaN),1},0,l),g.active=r,v.event&&v.event.start.call(n,n.__data__,t),p=[],v.tween.forEach(function(e,r){(r=r.call(n,n.__data__,t))&&p.push(r)}),h=v.ease,s=v.duration}function a(i){for(var u=i/s,o=h(u),a=p.length;a>0;)p[--a].call(n,o);return u>=1?(v.event&&v.event.end.call(n,n.__data__,t),--g.count?delete g[r]:delete n[e],1):void 0}var l,f,s,h,p,g=n[e]||(n[e]={active:0,count:0}),v=g[r];v||(l=i.time,f=qn(u,0,l),v=g[r]={tween:new c,time:l,timer:f,delay:i.delay,duration:i.duration,ease:i.ease,index:t},i=null,++g.count)}function no(n,t,e){n.attr("transform",function(n){var r=t(n);return"translate("+(isFinite(r)?r:e(n))+",0)"})}function to(n,t,e){n.attr("transform",function(n){var r=t(n);return"translate(0,"+(isFinite(r)?r:e(n))+")"})}function eo(n){return n.toISOString()}function ro(n,t,e){function r(t){return n(t)}function i(n,e){var r=n[1]-n[0],i=r/e,u=ao.bisect(Kl,i);return u==Kl.length?[t.year,Ki(n.map(function(n){return n/31536e6}),e)[2]]:u?t[i/Kl[u-1]<Kl[u]/i?u-1:u]:[tc,Ki(n,e)[2]]}return r.invert=function(t){return io(n.invert(t))},r.domain=function(t){return arguments.length?(n.domain(t),r):n.domain().map(io)},r.nice=function(n,t){function e(e){return!isNaN(e)&&!n.range(e,io(+e+1),t).length}var u=r.domain(),o=Yi(u),a=null==n?i(o,10):"number"==typeof n&&i(o,n);return a&&(n=a[0],t=a[1]),r.domain(Xi(u,t>1?{floor:function(t){for(;e(t=n.floor(t));)t=io(t-1);return t},ceil:function(t){for(;e(t=n.ceil(t));)t=io(+t+1);return t}}:n))},r.ticks=function(n,t){var e=Yi(r.domain()),u=null==n?i(e,10):"number"==typeof n?i(e,n):!n.range&&[{range:n},t];return u&&(n=u[0],t=u[1]),n.range(e[0],io(+e[1]+1),1>t?1:t)},r.tickFormat=function(){return e},r.copy=function(){return ro(n.copy(),t,e)},Ji(r,n)}function io(n){return new Date(n)}function uo(n){return JSON.parse(n.responseText)}function oo(n){var t=fo.createRange();return t.selectNode(fo.body),t.createContextualFragment(n.responseText)}var ao={version:"3.5.17"},lo=[].slice,co=function(n){return lo.call(n)},fo=this.document;if(fo)try{co(fo.documentElement.childNodes)[0].nodeType}catch(so){co=function(n){for(var t=n.length,e=new Array(t);t--;)e[t]=n[t];return e}}if(Date.now||(Date.now=function(){return+new Date}),fo)try{fo.createElement("DIV").style.setProperty("opacity",0,"")}catch(ho){var po=this.Element.prototype,go=po.setAttribute,vo=po.setAttributeNS,yo=this.CSSStyleDeclaration.prototype,mo=yo.setProperty;po.setAttribute=function(n,t){go.call(this,n,t+"")},po.setAttributeNS=function(n,t,e){vo.call(this,n,t,e+"")},yo.setProperty=function(n,t,e){mo.call(this,n,t+"",e)}}ao.ascending=e,ao.descending=function(n,t){return n>t?-1:t>n?1:t>=n?0:NaN},ao.min=function(n,t){var e,r,i=-1,u=n.length;if(1===arguments.length){for(;++i<u;)if(null!=(r=n[i])&&r>=r){e=r;break}for(;++i<u;)null!=(r=n[i])&&e>r&&(e=r)}else{for(;++i<u;)if(null!=(r=t.call(n,n[i],i))&&r>=r){e=r;break}for(;++i<u;)null!=(r=t.call(n,n[i],i))&&e>r&&(e=r)}return e},ao.max=function(n,t){var e,r,i=-1,u=n.length;if(1===arguments.length){for(;++i<u;)if(null!=(r=n[i])&&r>=r){e=r;break}for(;++i<u;)null!=(r=n[i])&&r>e&&(e=r)}else{for(;++i<u;)if(null!=(r=t.call(n,n[i],i))&&r>=r){e=r;break}for(;++i<u;)null!=(r=t.call(n,n[i],i))&&r>e&&(e=r)}return e},ao.extent=function(n,t){var e,r,i,u=-1,o=n.length;if(1===arguments.length){for(;++u<o;)if(null!=(r=n[u])&&r>=r){e=i=r;break}for(;++u<o;)null!=(r=n[u])&&(e>r&&(e=r),r>i&&(i=r))}else{for(;++u<o;)if(null!=(r=t.call(n,n[u],u))&&r>=r){e=i=r;break}for(;++u<o;)null!=(r=t.call(n,n[u],u))&&(e>r&&(e=r),r>i&&(i=r))}return[e,i]},ao.sum=function(n,t){var e,r=0,u=n.length,o=-1;if(1===arguments.length)for(;++o<u;)i(e=+n[o])&&(r+=e);else for(;++o<u;)i(e=+t.call(n,n[o],o))&&(r+=e);return r},ao.mean=function(n,t){var e,u=0,o=n.length,a=-1,l=o;if(1===arguments.length)for(;++a<o;)i(e=r(n[a]))?u+=e:--l;else for(;++a<o;)i(e=r(t.call(n,n[a],a)))?u+=e:--l;return l?u/l:void 0},ao.quantile=function(n,t){var e=(n.length-1)*t+1,r=Math.floor(e),i=+n[r-1],u=e-r;return u?i+u*(n[r]-i):i},ao.median=function(n,t){var u,o=[],a=n.length,l=-1;if(1===arguments.length)for(;++l<a;)i(u=r(n[l]))&&o.push(u);else for(;++l<a;)i(u=r(t.call(n,n[l],l)))&&o.push(u);return o.length?ao.quantile(o.sort(e),.5):void 0},ao.variance=function(n,t){var e,u,o=n.length,a=0,l=0,c=-1,f=0;if(1===arguments.length)for(;++c<o;)i(e=r(n[c]))&&(u=e-a,a+=u/++f,l+=u*(e-a));else for(;++c<o;)i(e=r(t.call(n,n[c],c)))&&(u=e-a,a+=u/++f,l+=u*(e-a));return f>1?l/(f-1):void 0},ao.deviation=function(){var n=ao.variance.apply(this,arguments);return n?Math.sqrt(n):n};var Mo=u(e);ao.bisectLeft=Mo.left,ao.bisect=ao.bisectRight=Mo.right,ao.bisector=function(n){return u(1===n.length?function(t,r){return e(n(t),r)}:n)},ao.shuffle=function(n,t,e){(u=arguments.length)<3&&(e=n.length,2>u&&(t=0));for(var r,i,u=e-t;u;)i=Math.random()*u--|0,r=n[u+t],n[u+t]=n[i+t],n[i+t]=r;return n},ao.permute=function(n,t){for(var e=t.length,r=new Array(e);e--;)r[e]=n[t[e]];return r},ao.pairs=function(n){for(var t,e=0,r=n.length-1,i=n[0],u=new Array(0>r?0:r);r>e;)u[e]=[t=i,i=n[++e]];return u},ao.transpose=function(n){if(!(i=n.length))return[];for(var t=-1,e=ao.min(n,o),r=new Array(e);++t<e;)for(var i,u=-1,a=r[t]=new Array(i);++u<i;)a[u]=n[u][t];return r},ao.zip=function(){return ao.transpose(arguments)},ao.keys=function(n){var t=[];for(var e in n)t.push(e);return t},ao.values=function(n){var t=[];for(var e in n)t.push(n[e]);return t},ao.entries=function(n){var t=[];for(var e in n)t.push({key:e,value:n[e]});return t},ao.merge=function(n){for(var t,e,r,i=n.length,u=-1,o=0;++u<i;)o+=n[u].length;for(e=new Array(o);--i>=0;)for(r=n[i],t=r.length;--t>=0;)e[--o]=r[t];return e};var xo=Math.abs;ao.range=function(n,t,e){if(arguments.length<3&&(e=1,arguments.length<2&&(t=n,n=0)),(t-n)/e===1/0)throw new Error("infinite range");var r,i=[],u=a(xo(e)),o=-1;if(n*=u,t*=u,e*=u,0>e)for(;(r=n+e*++o)>t;)i.push(r/u);else for(;(r=n+e*++o)<t;)i.push(r/u);return i},ao.map=function(n,t){var e=new c;if(n instanceof c)n.forEach(function(n,t){e.set(n,t)});else if(Array.isArray(n)){var r,i=-1,u=n.length;if(1===arguments.length)for(;++i<u;)e.set(i,n[i]);else for(;++i<u;)e.set(t.call(n,r=n[i],i),r)}else for(var o in n)e.set(o,n[o]);return e};var bo="__proto__",_o="\x00";l(c,{has:h,get:function(n){return this._[f(n)]},set:function(n,t){return this._[f(n)]=t},remove:p,keys:g,values:function(){var n=[];for(var t in this._)n.push(this._[t]);return n},entries:function(){var n=[];for(var t in this._)n.push({key:s(t),value:this._[t]});return n},size:v,empty:d,forEach:function(n){for(var t in this._)n.call(this,s(t),this._[t])}}),ao.nest=function(){function n(t,o,a){if(a>=u.length)return r?r.call(i,o):e?o.sort(e):o;for(var l,f,s,h,p=-1,g=o.length,v=u[a++],d=new c;++p<g;)(h=d.get(l=v(f=o[p])))?h.push(f):d.set(l,[f]);return t?(f=t(),s=function(e,r){f.set(e,n(t,r,a))}):(f={},s=function(e,r){f[e]=n(t,r,a)}),d.forEach(s),f}function t(n,e){if(e>=u.length)return n;var r=[],i=o[e++];return n.forEach(function(n,i){r.push({key:n,values:t(i,e)})}),i?r.sort(function(n,t){return i(n.key,t.key)}):r}var e,r,i={},u=[],o=[];return i.map=function(t,e){return n(e,t,0)},i.entries=function(e){return t(n(ao.map,e,0),0)},i.key=function(n){return u.push(n),i},i.sortKeys=function(n){return o[u.length-1]=n,i},i.sortValues=function(n){return e=n,i},i.rollup=function(n){return r=n,i},i},ao.set=function(n){var t=new y;if(n)for(var e=0,r=n.length;r>e;++e)t.add(n[e]);return t},l(y,{has:h,add:function(n){return this._[f(n+="")]=!0,n},remove:p,values:g,size:v,empty:d,forEach:function(n){for(var t in this._)n.call(this,s(t))}}),ao.behavior={},ao.rebind=function(n,t){for(var e,r=1,i=arguments.length;++r<i;)n[e=arguments[r]]=M(n,t,t[e]);return n};var wo=["webkit","ms","moz","Moz","o","O"];ao.dispatch=function(){for(var n=new _,t=-1,e=arguments.length;++t<e;)n[arguments[t]]=w(n);return n},_.prototype.on=function(n,t){var e=n.indexOf("."),r="";if(e>=0&&(r=n.slice(e+1),n=n.slice(0,e)),n)return arguments.length<2?this[n].on(r):this[n].on(r,t);if(2===arguments.length){if(null==t)for(n in this)this.hasOwnProperty(n)&&this[n].on(r,null);return this}},ao.event=null,ao.requote=function(n){return n.replace(So,"\\$&")};var So=/[\\\^\$\*\+\?\|\[\]\(\)\.\{\}]/g,ko={}.__proto__?function(n,t){n.__proto__=t}:function(n,t){for(var e in t)n[e]=t[e]},No=function(n,t){return t.querySelector(n)},Eo=function(n,t){return t.querySelectorAll(n)},Ao=function(n,t){var e=n.matches||n[x(n,"matchesSelector")];return(Ao=function(n,t){return e.call(n,t)})(n,t)};"function"==typeof Sizzle&&(No=function(n,t){return Sizzle(n,t)[0]||null},Eo=Sizzle,Ao=Sizzle.matchesSelector),ao.selection=function(){return ao.select(fo.documentElement)};var Co=ao.selection.prototype=[];Co.select=function(n){var t,e,r,i,u=[];n=A(n);for(var o=-1,a=this.length;++o<a;){u.push(t=[]),t.parentNode=(r=this[o]).parentNode;for(var l=-1,c=r.length;++l<c;)(i=r[l])?(t.push(e=n.call(i,i.__data__,l,o)),e&&"__data__"in i&&(e.__data__=i.__data__)):t.push(null)}return E(u)},Co.selectAll=function(n){var t,e,r=[];n=C(n);for(var i=-1,u=this.length;++i<u;)for(var o=this[i],a=-1,l=o.length;++a<l;)(e=o[a])&&(r.push(t=co(n.call(e,e.__data__,a,i))),t.parentNode=e);return E(r)};var zo="http://www.w3.org/1999/xhtml",Lo={svg:"http://www.w3.org/2000/svg",xhtml:zo,xlink:"http://www.w3.org/1999/xlink",xml:"http://www.w3.org/XML/1998/namespace",xmlns:"http://www.w3.org/2000/xmlns/"};ao.ns={prefix:Lo,qualify:function(n){var t=n.indexOf(":"),e=n;return t>=0&&"xmlns"!==(e=n.slice(0,t))&&(n=n.slice(t+1)),Lo.hasOwnProperty(e)?{space:Lo[e],local:n}:n}},Co.attr=function(n,t){if(arguments.length<2){if("string"==typeof n){var e=this.node();return n=ao.ns.qualify(n),n.local?e.getAttributeNS(n.space,n.local):e.getAttribute(n)}for(t in n)this.each(z(t,n[t]));return this}return this.each(z(n,t))},Co.classed=function(n,t){if(arguments.length<2){if("string"==typeof n){var e=this.node(),r=(n=T(n)).length,i=-1;if(t=e.classList){for(;++i<r;)if(!t.contains(n[i]))return!1}else for(t=e.getAttribute("class");++i<r;)if(!q(n[i]).test(t))return!1;return!0}for(t in n)this.each(R(t,n[t]));return this}return this.each(R(n,t))},Co.style=function(n,e,r){var i=arguments.length;if(3>i){if("string"!=typeof n){2>i&&(e="");for(r in n)this.each(P(r,n[r],e));return this}if(2>i){var u=this.node();return t(u).getComputedStyle(u,null).getPropertyValue(n)}r=""}return this.each(P(n,e,r))},Co.property=function(n,t){if(arguments.length<2){if("string"==typeof n)return this.node()[n];for(t in n)this.each(U(t,n[t]));return this}return this.each(U(n,t))},Co.text=function(n){return arguments.length?this.each("function"==typeof n?function(){var t=n.apply(this,arguments);this.textContent=null==t?"":t}:null==n?function(){this.textContent=""}:function(){this.textContent=n}):this.node().textContent},Co.html=function(n){return arguments.length?this.each("function"==typeof n?function(){var t=n.apply(this,arguments);this.innerHTML=null==t?"":t}:null==n?function(){this.innerHTML=""}:function(){this.innerHTML=n}):this.node().innerHTML},Co.append=function(n){return n=j(n),this.select(function(){return this.appendChild(n.apply(this,arguments))})},Co.insert=function(n,t){return n=j(n),t=A(t),this.select(function(){return this.insertBefore(n.apply(this,arguments),t.apply(this,arguments)||null)})},Co.remove=function(){return this.each(F)},Co.data=function(n,t){function e(n,e){var r,i,u,o=n.length,s=e.length,h=Math.min(o,s),p=new Array(s),g=new Array(s),v=new Array(o);if(t){var d,y=new c,m=new Array(o);for(r=-1;++r<o;)(i=n[r])&&(y.has(d=t.call(i,i.__data__,r))?v[r]=i:y.set(d,i),m[r]=d);for(r=-1;++r<s;)(i=y.get(d=t.call(e,u=e[r],r)))?i!==!0&&(p[r]=i,i.__data__=u):g[r]=H(u),y.set(d,!0);for(r=-1;++r<o;)r in m&&y.get(m[r])!==!0&&(v[r]=n[r])}else{for(r=-1;++r<h;)i=n[r],u=e[r],i?(i.__data__=u,p[r]=i):g[r]=H(u);for(;s>r;++r)g[r]=H(e[r]);for(;o>r;++r)v[r]=n[r]}g.update=p,g.parentNode=p.parentNode=v.parentNode=n.parentNode,a.push(g),l.push(p),f.push(v)}var r,i,u=-1,o=this.length;if(!arguments.length){for(n=new Array(o=(r=this[0]).length);++u<o;)(i=r[u])&&(n[u]=i.__data__);return n}var a=Z([]),l=E([]),f=E([]);if("function"==typeof n)for(;++u<o;)e(r=this[u],n.call(r,r.parentNode.__data__,u));else for(;++u<o;)e(r=this[u],n);return l.enter=function(){return a},l.exit=function(){return f},l},Co.datum=function(n){return arguments.length?this.property("__data__",n):this.property("__data__")},Co.filter=function(n){var t,e,r,i=[];"function"!=typeof n&&(n=O(n));for(var u=0,o=this.length;o>u;u++){i.push(t=[]),t.parentNode=(e=this[u]).parentNode;for(var a=0,l=e.length;l>a;a++)(r=e[a])&&n.call(r,r.__data__,a,u)&&t.push(r)}return E(i)},Co.order=function(){for(var n=-1,t=this.length;++n<t;)for(var e,r=this[n],i=r.length-1,u=r[i];--i>=0;)(e=r[i])&&(u&&u!==e.nextSibling&&u.parentNode.insertBefore(e,u),u=e);return this},Co.sort=function(n){n=I.apply(this,arguments);for(var t=-1,e=this.length;++t<e;)this[t].sort(n);return this.order()},Co.each=function(n){return Y(this,function(t,e,r){n.call(t,t.__data__,e,r)})},Co.call=function(n){var t=co(arguments);return n.apply(t[0]=this,t),this},Co.empty=function(){return!this.node()},Co.node=function(){for(var n=0,t=this.length;t>n;n++)for(var e=this[n],r=0,i=e.length;i>r;r++){var u=e[r];if(u)return u}return null},Co.size=function(){var n=0;return Y(this,function(){++n}),n};var qo=[];ao.selection.enter=Z,ao.selection.enter.prototype=qo,qo.append=Co.append,qo.empty=Co.empty,qo.node=Co.node,qo.call=Co.call,qo.size=Co.size,qo.select=function(n){for(var t,e,r,i,u,o=[],a=-1,l=this.length;++a<l;){r=(i=this[a]).update,o.push(t=[]),t.parentNode=i.parentNode;for(var c=-1,f=i.length;++c<f;)(u=i[c])?(t.push(r[c]=e=n.call(i.parentNode,u.__data__,c,a)),e.__data__=u.__data__):t.push(null)}return E(o)},qo.insert=function(n,t){return arguments.length<2&&(t=V(this)),Co.insert.call(this,n,t)},ao.select=function(t){var e;return"string"==typeof t?(e=[No(t,fo)],e.parentNode=fo.documentElement):(e=[t],e.parentNode=n(t)),E([e])},ao.selectAll=function(n){var t;return"string"==typeof n?(t=co(Eo(n,fo)),t.parentNode=fo.documentElement):(t=co(n),t.parentNode=null),E([t])},Co.on=function(n,t,e){var r=arguments.length;if(3>r){if("string"!=typeof n){2>r&&(t=!1);for(e in n)this.each(X(e,n[e],t));return this}if(2>r)return(r=this.node()["__on"+n])&&r._;e=!1}return this.each(X(n,t,e))};var To=ao.map({mouseenter:"mouseover",mouseleave:"mouseout"});fo&&To.forEach(function(n){"on"+n in fo&&To.remove(n)});var Ro,Do=0;ao.mouse=function(n){return J(n,k())};var Po=this.navigator&&/WebKit/.test(this.navigator.userAgent)?-1:0;ao.touch=function(n,t,e){if(arguments.length<3&&(e=t,t=k().changedTouches),t)for(var r,i=0,u=t.length;u>i;++i)if((r=t[i]).identifier===e)return J(n,r)},ao.behavior.drag=function(){function n(){this.on("mousedown.drag",u).on("touchstart.drag",o)}function e(n,t,e,u,o){return function(){function a(){var n,e,r=t(h,v);r&&(n=r[0]-M[0],e=r[1]-M[1],g|=n|e,M=r,p({type:"drag",x:r[0]+c[0],y:r[1]+c[1],dx:n,dy:e}))}function l(){t(h,v)&&(y.on(u+d,null).on(o+d,null),m(g),p({type:"dragend"}))}var c,f=this,s=ao.event.target.correspondingElement||ao.event.target,h=f.parentNode,p=r.of(f,arguments),g=0,v=n(),d=".drag"+(null==v?"":"-"+v),y=ao.select(e(s)).on(u+d,a).on(o+d,l),m=W(s),M=t(h,v);i?(c=i.apply(f,arguments),c=[c.x-M[0],c.y-M[1]]):c=[0,0],p({type:"dragstart"})}}var r=N(n,"drag","dragstart","dragend"),i=null,u=e(b,ao.mouse,t,"mousemove","mouseup"),o=e(G,ao.touch,m,"touchmove","touchend");return n.origin=function(t){return arguments.length?(i=t,n):i},ao.rebind(n,r,"on")},ao.touches=function(n,t){return arguments.length<2&&(t=k().touches),t?co(t).map(function(t){var e=J(n,t);return e.identifier=t.identifier,e}):[]};var Uo=1e-6,jo=Uo*Uo,Fo=Math.PI,Ho=2*Fo,Oo=Ho-Uo,Io=Fo/2,Yo=Fo/180,Zo=180/Fo,Vo=Math.SQRT2,Xo=2,$o=4;ao.interpolateZoom=function(n,t){var e,r,i=n[0],u=n[1],o=n[2],a=t[0],l=t[1],c=t[2],f=a-i,s=l-u,h=f*f+s*s;if(jo>h)r=Math.log(c/o)/Vo,e=function(n){return[i+n*f,u+n*s,o*Math.exp(Vo*n*r)]};else{var p=Math.sqrt(h),g=(c*c-o*o+$o*h)/(2*o*Xo*p),v=(c*c-o*o-$o*h)/(2*c*Xo*p),d=Math.log(Math.sqrt(g*g+1)-g),y=Math.log(Math.sqrt(v*v+1)-v);r=(y-d)/Vo,e=function(n){var t=n*r,e=rn(d),a=o/(Xo*p)*(e*un(Vo*t+d)-en(d));return[i+a*f,u+a*s,o*e/rn(Vo*t+d)]}}return e.duration=1e3*r,e},ao.behavior.zoom=function(){function n(n){n.on(L,s).on(Wo+".zoom",p).on("dblclick.zoom",g).on(R,h)}function e(n){return[(n[0]-k.x)/k.k,(n[1]-k.y)/k.k]}function r(n){return[n[0]*k.k+k.x,n[1]*k.k+k.y]}function i(n){k.k=Math.max(A[0],Math.min(A[1],n))}function u(n,t){t=r(t),k.x+=n[0]-t[0],k.y+=n[1]-t[1]}function o(t,e,r,o){t.__chart__={x:k.x,y:k.y,k:k.k},i(Math.pow(2,o)),u(d=e,r),t=ao.select(t),C>0&&(t=t.transition().duration(C)),t.call(n.event)}function a(){b&&b.domain(x.range().map(function(n){return(n-k.x)/k.k}).map(x.invert)),w&&w.domain(_.range().map(function(n){return(n-k.y)/k.k}).map(_.invert))}function l(n){z++||n({type:"zoomstart"})}function c(n){a(),n({type:"zoom",scale:k.k,translate:[k.x,k.y]})}function f(n){--z||(n({type:"zoomend"}),d=null)}function s(){function n(){a=1,u(ao.mouse(i),h),c(o)}function r(){s.on(q,null).on(T,null),p(a),f(o)}var i=this,o=D.of(i,arguments),a=0,s=ao.select(t(i)).on(q,n).on(T,r),h=e(ao.mouse(i)),p=W(i);Il.call(i),l(o)}function h(){function n(){var n=ao.touches(g);return p=k.k,n.forEach(function(n){n.identifier in d&&(d[n.identifier]=e(n))}),n}function t(){var t=ao.event.target;ao.select(t).on(x,r).on(b,a),_.push(t);for(var e=ao.event.changedTouches,i=0,u=e.length;u>i;++i)d[e[i].identifier]=null;var l=n(),c=Date.now();if(1===l.length){if(500>c-M){var f=l[0];o(g,f,d[f.identifier],Math.floor(Math.log(k.k)/Math.LN2)+1),S()}M=c}else if(l.length>1){var f=l[0],s=l[1],h=f[0]-s[0],p=f[1]-s[1];y=h*h+p*p}}function r(){var n,t,e,r,o=ao.touches(g);Il.call(g);for(var a=0,l=o.length;l>a;++a,r=null)if(e=o[a],r=d[e.identifier]){if(t)break;n=e,t=r}if(r){var f=(f=e[0]-n[0])*f+(f=e[1]-n[1])*f,s=y&&Math.sqrt(f/y);n=[(n[0]+e[0])/2,(n[1]+e[1])/2],t=[(t[0]+r[0])/2,(t[1]+r[1])/2],i(s*p)}M=null,u(n,t),c(v)}function a(){if(ao.event.touches.length){for(var t=ao.event.changedTouches,e=0,r=t.length;r>e;++e)delete d[t[e].identifier];for(var i in d)return void n()}ao.selectAll(_).on(m,null),w.on(L,s).on(R,h),N(),f(v)}var p,g=this,v=D.of(g,arguments),d={},y=0,m=".zoom-"+ao.event.changedTouches[0].identifier,x="touchmove"+m,b="touchend"+m,_=[],w=ao.select(g),N=W(g);t(),l(v),w.on(L,null).on(R,t)}function p(){var n=D.of(this,arguments);m?clearTimeout(m):(Il.call(this),v=e(d=y||ao.mouse(this)),l(n)),m=setTimeout(function(){m=null,f(n)},50),S(),i(Math.pow(2,.002*Bo())*k.k),u(d,v),c(n)}function g(){var n=ao.mouse(this),t=Math.log(k.k)/Math.LN2;o(this,n,e(n),ao.event.shiftKey?Math.ceil(t)-1:Math.floor(t)+1)}var v,d,y,m,M,x,b,_,w,k={x:0,y:0,k:1},E=[960,500],A=Jo,C=250,z=0,L="mousedown.zoom",q="mousemove.zoom",T="mouseup.zoom",R="touchstart.zoom",D=N(n,"zoomstart","zoom","zoomend");return Wo||(Wo="onwheel"in fo?(Bo=function(){return-ao.event.deltaY*(ao.event.deltaMode?120:1)},"wheel"):"onmousewheel"in fo?(Bo=function(){return ao.event.wheelDelta},"mousewheel"):(Bo=function(){return-ao.event.detail},"MozMousePixelScroll")),n.event=function(n){n.each(function(){var n=D.of(this,arguments),t=k;Hl?ao.select(this).transition().each("start.zoom",function(){k=this.__chart__||{x:0,y:0,k:1},l(n)}).tween("zoom:zoom",function(){var e=E[0],r=E[1],i=d?d[0]:e/2,u=d?d[1]:r/2,o=ao.interpolateZoom([(i-k.x)/k.k,(u-k.y)/k.k,e/k.k],[(i-t.x)/t.k,(u-t.y)/t.k,e/t.k]);return function(t){var r=o(t),a=e/r[2];this.__chart__=k={x:i-r[0]*a,y:u-r[1]*a,k:a},c(n)}}).each("interrupt.zoom",function(){f(n)}).each("end.zoom",function(){f(n)}):(this.__chart__=k,l(n),c(n),f(n))})},n.translate=function(t){return arguments.length?(k={x:+t[0],y:+t[1],k:k.k},a(),n):[k.x,k.y]},n.scale=function(t){return arguments.length?(k={x:k.x,y:k.y,k:null},i(+t),a(),n):k.k},n.scaleExtent=function(t){return arguments.length?(A=null==t?Jo:[+t[0],+t[1]],n):A},n.center=function(t){return arguments.length?(y=t&&[+t[0],+t[1]],n):y},n.size=function(t){return arguments.length?(E=t&&[+t[0],+t[1]],n):E},n.duration=function(t){return arguments.length?(C=+t,n):C},n.x=function(t){return arguments.length?(b=t,x=t.copy(),k={x:0,y:0,k:1},n):b},n.y=function(t){return arguments.length?(w=t,_=t.copy(),k={x:0,y:0,k:1},n):w},ao.rebind(n,D,"on")};var Bo,Wo,Jo=[0,1/0];ao.color=an,an.prototype.toString=function(){return this.rgb()+""},ao.hsl=ln;var Go=ln.prototype=new an;Go.brighter=function(n){return n=Math.pow(.7,arguments.length?n:1),new ln(this.h,this.s,this.l/n)},Go.darker=function(n){return n=Math.pow(.7,arguments.length?n:1),new ln(this.h,this.s,n*this.l)},Go.rgb=function(){return cn(this.h,this.s,this.l)},ao.hcl=fn;var Ko=fn.prototype=new an;Ko.brighter=function(n){return new fn(this.h,this.c,Math.min(100,this.l+Qo*(arguments.length?n:1)))},Ko.darker=function(n){return new fn(this.h,this.c,Math.max(0,this.l-Qo*(arguments.length?n:1)))},Ko.rgb=function(){return sn(this.h,this.c,this.l).rgb()},ao.lab=hn;var Qo=18,na=.95047,ta=1,ea=1.08883,ra=hn.prototype=new an;ra.brighter=function(n){return new hn(Math.min(100,this.l+Qo*(arguments.length?n:1)),this.a,this.b)},ra.darker=function(n){return new hn(Math.max(0,this.l-Qo*(arguments.length?n:1)),this.a,this.b)},ra.rgb=function(){return pn(this.l,this.a,this.b)},ao.rgb=mn;var ia=mn.prototype=new an;ia.brighter=function(n){n=Math.pow(.7,arguments.length?n:1);var t=this.r,e=this.g,r=this.b,i=30;return t||e||r?(t&&i>t&&(t=i),e&&i>e&&(e=i),r&&i>r&&(r=i),new mn(Math.min(255,t/n),Math.min(255,e/n),Math.min(255,r/n))):new mn(i,i,i)},ia.darker=function(n){return n=Math.pow(.7,arguments.length?n:1),new mn(n*this.r,n*this.g,n*this.b)},ia.hsl=function(){return wn(this.r,this.g,this.b)},ia.toString=function(){return"#"+bn(this.r)+bn(this.g)+bn(this.b)};var ua=ao.map({aliceblue:15792383,antiquewhite:16444375,aqua:65535,aquamarine:8388564,azure:15794175,beige:16119260,bisque:16770244,black:0,blanchedalmond:16772045,blue:255,blueviolet:9055202,brown:10824234,burlywood:14596231,cadetblue:6266528,chartreuse:8388352,chocolate:13789470,coral:16744272,cornflowerblue:6591981,cornsilk:16775388,crimson:14423100,cyan:65535,darkblue:139,darkcyan:35723,darkgoldenrod:12092939,darkgray:11119017,darkgreen:25600,darkgrey:11119017,darkkhaki:12433259,darkmagenta:9109643,darkolivegreen:5597999,darkorange:16747520,darkorchid:10040012,darkred:9109504,darksalmon:15308410,darkseagreen:9419919,darkslateblue:4734347,darkslategray:3100495,darkslategrey:3100495,darkturquoise:52945,darkviolet:9699539,deeppink:16716947,deepskyblue:49151,dimgray:6908265,dimgrey:6908265,dodgerblue:2003199,firebrick:11674146,floralwhite:16775920,forestgreen:2263842,fuchsia:16711935,gainsboro:14474460,ghostwhite:16316671,gold:16766720,goldenrod:14329120,gray:8421504,green:32768,greenyellow:11403055,grey:8421504,honeydew:15794160,hotpink:16738740,indianred:13458524,indigo:4915330,ivory:16777200,khaki:15787660,lavender:15132410,lavenderblush:16773365,lawngreen:8190976,lemonchiffon:16775885,lightblue:11393254,lightcoral:15761536,lightcyan:14745599,lightgoldenrodyellow:16448210,lightgray:13882323,lightgreen:9498256,lightgrey:13882323,lightpink:16758465,lightsalmon:16752762,lightseagreen:2142890,lightskyblue:8900346,lightslategray:7833753,lightslategrey:7833753,lightsteelblue:11584734,lightyellow:16777184,lime:65280,limegreen:3329330,linen:16445670,magenta:16711935,maroon:8388608,mediumaquamarine:6737322,mediumblue:205,mediumorchid:12211667,mediumpurple:9662683,mediumseagreen:3978097,mediumslateblue:8087790,mediumspringgreen:64154,mediumturquoise:4772300,mediumvioletred:13047173,midnightblue:1644912,mintcream:16121850,mistyrose:16770273,moccasin:16770229,navajowhite:16768685,navy:128,oldlace:16643558,olive:8421376,olivedrab:7048739,orange:16753920,orangered:16729344,orchid:14315734,palegoldenrod:15657130,palegreen:10025880,paleturquoise:11529966,palevioletred:14381203,papayawhip:16773077,peachpuff:16767673,peru:13468991,pink:16761035,plum:14524637,powderblue:11591910,purple:8388736,rebeccapurple:6697881,red:16711680,rosybrown:12357519,royalblue:4286945,saddlebrown:9127187,salmon:16416882,sandybrown:16032864,seagreen:3050327,seashell:16774638,sienna:10506797,silver:12632256,skyblue:8900331,slateblue:6970061,slategray:7372944,slategrey:7372944,snow:16775930,springgreen:65407,steelblue:4620980,tan:13808780,teal:32896,thistle:14204888,tomato:16737095,turquoise:4251856,violet:15631086,wheat:16113331,white:16777215,whitesmoke:16119285,yellow:16776960,yellowgreen:10145074});ua.forEach(function(n,t){ua.set(n,Mn(t))}),ao.functor=En,ao.xhr=An(m),ao.dsv=function(n,t){function e(n,e,u){arguments.length<3&&(u=e,e=null);var o=Cn(n,t,null==e?r:i(e),u);return o.row=function(n){return arguments.length?o.response(null==(e=n)?r:i(n)):e},o}function r(n){return e.parse(n.responseText)}function i(n){return function(t){return e.parse(t.responseText,n)}}function u(t){return t.map(o).join(n)}function o(n){return a.test(n)?'"'+n.replace(/\"/g,'""')+'"':n}var a=new RegExp('["'+n+"\n]"),l=n.charCodeAt(0);return e.parse=function(n,t){var r;return e.parseRows(n,function(n,e){if(r)return r(n,e-1);var i=new Function("d","return {"+n.map(function(n,t){return JSON.stringify(n)+": d["+t+"]"}).join(",")+"}");r=t?function(n,e){return t(i(n),e)}:i})},e.parseRows=function(n,t){function e(){if(f>=c)return o;if(i)return i=!1,u;var t=f;if(34===n.charCodeAt(t)){for(var e=t;e++<c;)if(34===n.charCodeAt(e)){if(34!==n.charCodeAt(e+1))break;++e}f=e+2;var r=n.charCodeAt(e+1);return 13===r?(i=!0,10===n.charCodeAt(e+2)&&++f):10===r&&(i=!0),n.slice(t+1,e).replace(/""/g,'"')}for(;c>f;){var r=n.charCodeAt(f++),a=1;if(10===r)i=!0;else if(13===r)i=!0,10===n.charCodeAt(f)&&(++f,++a);else if(r!==l)continue;return n.slice(t,f-a)}return n.slice(t)}for(var r,i,u={},o={},a=[],c=n.length,f=0,s=0;(r=e())!==o;){for(var h=[];r!==u&&r!==o;)h.push(r),r=e();t&&null==(h=t(h,s++))||a.push(h)}return a},e.format=function(t){if(Array.isArray(t[0]))return e.formatRows(t);var r=new y,i=[];return t.forEach(function(n){for(var t in n)r.has(t)||i.push(r.add(t))}),[i.map(o).join(n)].concat(t.map(function(t){return i.map(function(n){return o(t[n])}).join(n)})).join("\n")},e.formatRows=function(n){return n.map(u).join("\n")},e},ao.csv=ao.dsv(",","text/csv"),ao.tsv=ao.dsv("  ","text/tab-separated-values");var oa,aa,la,ca,fa=this[x(this,"requestAnimationFrame")]||function(n){setTimeout(n,17)};ao.timer=function(){qn.apply(this,arguments)},ao.timer.flush=function(){Rn(),Dn()},ao.round=function(n,t){return t?Math.round(n*(t=Math.pow(10,t)))/t:Math.round(n)};var sa=["y","z","a","f","p","n","\xb5","m","","k","M","G","T","P","E","Z","Y"].map(Un);ao.formatPrefix=function(n,t){var e=0;return(n=+n)&&(0>n&&(n*=-1),t&&(n=ao.round(n,Pn(n,t))),e=1+Math.floor(1e-12+Math.log(n)/Math.LN10),e=Math.max(-24,Math.min(24,3*Math.floor((e-1)/3)))),sa[8+e/3]};var ha=/(?:([^{])?([<>=^]))?([+\- ])?([$#])?(0)?(\d+)?(,)?(\.-?\d+)?([a-z%])?/i,pa=ao.map({b:function(n){return n.toString(2)},c:function(n){return String.fromCharCode(n)},o:function(n){return n.toString(8)},x:function(n){return n.toString(16)},X:function(n){return n.toString(16).toUpperCase()},g:function(n,t){return n.toPrecision(t)},e:function(n,t){return n.toExponential(t)},f:function(n,t){return n.toFixed(t)},r:function(n,t){return(n=ao.round(n,Pn(n,t))).toFixed(Math.max(0,Math.min(20,Pn(n*(1+1e-15),t))))}}),ga=ao.time={},va=Date;Hn.prototype={getDate:function(){return this._.getUTCDate()},getDay:function(){return this._.getUTCDay()},getFullYear:function(){return this._.getUTCFullYear()},getHours:function(){return this._.getUTCHours()},getMilliseconds:function(){return this._.getUTCMilliseconds()},getMinutes:function(){return this._.getUTCMinutes()},getMonth:function(){return this._.getUTCMonth()},getSeconds:function(){return this._.getUTCSeconds()},getTime:function(){return this._.getTime()},getTimezoneOffset:function(){return 0},valueOf:function(){return this._.valueOf()},setDate:function(){da.setUTCDate.apply(this._,arguments)},setDay:function(){da.setUTCDay.apply(this._,arguments)},setFullYear:function(){da.setUTCFullYear.apply(this._,arguments)},setHours:function(){da.setUTCHours.apply(this._,arguments)},setMilliseconds:function(){da.setUTCMilliseconds.apply(this._,arguments)},setMinutes:function(){da.setUTCMinutes.apply(this._,arguments)},setMonth:function(){da.setUTCMonth.apply(this._,arguments)},setSeconds:function(){da.setUTCSeconds.apply(this._,arguments)},setTime:function(){da.setTime.apply(this._,arguments)}};var da=Date.prototype;ga.year=On(function(n){return n=ga.day(n),n.setMonth(0,1),n},function(n,t){n.setFullYear(n.getFullYear()+t)},function(n){return n.getFullYear()}),ga.years=ga.year.range,ga.years.utc=ga.year.utc.range,ga.day=On(function(n){var t=new va(2e3,0);return t.setFullYear(n.getFullYear(),n.getMonth(),n.getDate()),t},function(n,t){n.setDate(n.getDate()+t)},function(n){return n.getDate()-1}),ga.days=ga.day.range,ga.days.utc=ga.day.utc.range,ga.dayOfYear=function(n){var t=ga.year(n);return Math.floor((n-t-6e4*(n.getTimezoneOffset()-t.getTimezoneOffset()))/864e5)},["sunday","monday","tuesday","wednesday","thursday","friday","saturday"].forEach(function(n,t){t=7-t;var e=ga[n]=On(function(n){return(n=ga.day(n)).setDate(n.getDate()-(n.getDay()+t)%7),n},function(n,t){n.setDate(n.getDate()+7*Math.floor(t))},function(n){var e=ga.year(n).getDay();return Math.floor((ga.dayOfYear(n)+(e+t)%7)/7)-(e!==t)});ga[n+"s"]=e.range,ga[n+"s"].utc=e.utc.range,ga[n+"OfYear"]=function(n){var e=ga.year(n).getDay();return Math.floor((ga.dayOfYear(n)+(e+t)%7)/7)}}),ga.week=ga.sunday,ga.weeks=ga.sunday.range,ga.weeks.utc=ga.sunday.utc.range,ga.weekOfYear=ga.sundayOfYear;var ya={"-":"",_:" ",0:"0"},ma=/^\s*\d+/,Ma=/^%/;ao.locale=function(n){return{numberFormat:jn(n),timeFormat:Yn(n)}};var xa=ao.locale({decimal:".",thousands:",",grouping:[3],currency:["$",""],dateTime:"%a %b %e %X %Y",date:"%m/%d/%Y",time:"%H:%M:%S",periods:["AM","PM"],days:["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"],shortDays:["Sun","Mon","Tue","Wed","Thu","Fri","Sat"],months:["January","February","March","April","May","June","July","August","September","October","November","December"],shortMonths:["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]});ao.format=xa.numberFormat,ao.geo={},ft.prototype={s:0,t:0,add:function(n){st(n,this.t,ba),st(ba.s,this.s,this),this.s?this.t+=ba.t:this.s=ba.t},reset:function(){this.s=this.t=0},valueOf:function(){return this.s}};var ba=new ft;ao.geo.stream=function(n,t){n&&_a.hasOwnProperty(n.type)?_a[n.type](n,t):ht(n,t)};var _a={Feature:function(n,t){ht(n.geometry,t)},FeatureCollection:function(n,t){for(var e=n.features,r=-1,i=e.length;++r<i;)ht(e[r].geometry,t)}},wa={Sphere:function(n,t){t.sphere()},Point:function(n,t){n=n.coordinates,t.point(n[0],n[1],n[2])},MultiPoint:function(n,t){for(var e=n.coordinates,r=-1,i=e.length;++r<i;)n=e[r],t.point(n[0],n[1],n[2])},LineString:function(n,t){pt(n.coordinates,t,0)},MultiLineString:function(n,t){for(var e=n.coordinates,r=-1,i=e.length;++r<i;)pt(e[r],t,0)},Polygon:function(n,t){gt(n.coordinates,t)},MultiPolygon:function(n,t){for(var e=n.coordinates,r=-1,i=e.length;++r<i;)gt(e[r],t)},GeometryCollection:function(n,t){for(var e=n.geometries,r=-1,i=e.length;++r<i;)ht(e[r],t)}};ao.geo.area=function(n){return Sa=0,ao.geo.stream(n,Na),Sa};var Sa,ka=new ft,Na={sphere:function(){Sa+=4*Fo},point:b,lineStart:b,lineEnd:b,polygonStart:function(){ka.reset(),Na.lineStart=vt},polygonEnd:function(){var n=2*ka;Sa+=0>n?4*Fo+n:n,Na.lineStart=Na.lineEnd=Na.point=b}};ao.geo.bounds=function(){function n(n,t){M.push(x=[f=n,h=n]),s>t&&(s=t),t>p&&(p=t)}function t(t,e){var r=dt([t*Yo,e*Yo]);if(y){var i=mt(y,r),u=[i[1],-i[0],0],o=mt(u,i);bt(o),o=_t(o);var l=t-g,c=l>0?1:-1,v=o[0]*Zo*c,d=xo(l)>180;if(d^(v>c*g&&c*t>v)){var m=o[1]*Zo;m>p&&(p=m)}else if(v=(v+360)%360-180,d^(v>c*g&&c*t>v)){var m=-o[1]*Zo;s>m&&(s=m)}else s>e&&(s=e),e>p&&(p=e);d?g>t?a(f,t)>a(f,h)&&(h=t):a(t,h)>a(f,h)&&(f=t):h>=f?(f>t&&(f=t),t>h&&(h=t)):t>g?a(f,t)>a(f,h)&&(h=t):a(t,h)>a(f,h)&&(f=t)}else n(t,e);y=r,g=t}function e(){b.point=t}function r(){x[0]=f,x[1]=h,b.point=n,y=null}function i(n,e){if(y){var r=n-g;m+=xo(r)>180?r+(r>0?360:-360):r}else v=n,d=e;Na.point(n,e),t(n,e)}function u(){Na.lineStart()}function o(){i(v,d),Na.lineEnd(),xo(m)>Uo&&(f=-(h=180)),x[0]=f,x[1]=h,y=null}function a(n,t){return(t-=n)<0?t+360:t}function l(n,t){return n[0]-t[0]}function c(n,t){return t[0]<=t[1]?t[0]<=n&&n<=t[1]:n<t[0]||t[1]<n}var f,s,h,p,g,v,d,y,m,M,x,b={point:n,lineStart:e,lineEnd:r,polygonStart:function(){b.point=i,b.lineStart=u,b.lineEnd=o,m=0,Na.polygonStart()},polygonEnd:function(){Na.polygonEnd(),b.point=n,b.lineStart=e,b.lineEnd=r,0>ka?(f=-(h=180),s=-(p=90)):m>Uo?p=90:-Uo>m&&(s=-90),x[0]=f,x[1]=h}};return function(n){p=h=-(f=s=1/0),M=[],ao.geo.stream(n,b);var t=M.length;if(t){M.sort(l);for(var e,r=1,i=M[0],u=[i];t>r;++r)e=M[r],c(e[0],i)||c(e[1],i)?(a(i[0],e[1])>a(i[0],i[1])&&(i[1]=e[1]),a(e[0],i[1])>a(i[0],i[1])&&(i[0]=e[0])):u.push(i=e);for(var o,e,g=-(1/0),t=u.length-1,r=0,i=u[t];t>=r;i=e,++r)e=u[r],(o=a(i[1],e[0]))>g&&(g=o,f=e[0],h=i[1])}return M=x=null,f===1/0||s===1/0?[[NaN,NaN],[NaN,NaN]]:[[f,s],[h,p]]}}(),ao.geo.centroid=function(n){Ea=Aa=Ca=za=La=qa=Ta=Ra=Da=Pa=Ua=0,ao.geo.stream(n,ja);var t=Da,e=Pa,r=Ua,i=t*t+e*e+r*r;return jo>i&&(t=qa,e=Ta,r=Ra,Uo>Aa&&(t=Ca,e=za,r=La),i=t*t+e*e+r*r,jo>i)?[NaN,NaN]:[Math.atan2(e,t)*Zo,tn(r/Math.sqrt(i))*Zo]};var Ea,Aa,Ca,za,La,qa,Ta,Ra,Da,Pa,Ua,ja={sphere:b,point:St,lineStart:Nt,lineEnd:Et,polygonStart:function(){ja.lineStart=At},polygonEnd:function(){ja.lineStart=Nt}},Fa=Rt(zt,jt,Ht,[-Fo,-Fo/2]),Ha=1e9;ao.geo.clipExtent=function(){var n,t,e,r,i,u,o={stream:function(n){return i&&(i.valid=!1),i=u(n),i.valid=!0,i},extent:function(a){return arguments.length?(u=Zt(n=+a[0][0],t=+a[0][1],e=+a[1][0],r=+a[1][1]),i&&(i.valid=!1,i=null),o):[[n,t],[e,r]]}};return o.extent([[0,0],[960,500]])},(ao.geo.conicEqualArea=function(){return Vt(Xt)}).raw=Xt,ao.geo.albers=function(){return ao.geo.conicEqualArea().rotate([96,0]).center([-.6,38.7]).parallels([29.5,45.5]).scale(1070)},ao.geo.albersUsa=function(){function n(n){var u=n[0],o=n[1];return t=null,e(u,o),t||(r(u,o),t)||i(u,o),t}var t,e,r,i,u=ao.geo.albers(),o=ao.geo.conicEqualArea().rotate([154,0]).center([-2,58.5]).parallels([55,65]),a=ao.geo.conicEqualArea().rotate([157,0]).center([-3,19.9]).parallels([8,18]),l={point:function(n,e){t=[n,e]}};return n.invert=function(n){var t=u.scale(),e=u.translate(),r=(n[0]-e[0])/t,i=(n[1]-e[1])/t;return(i>=.12&&.234>i&&r>=-.425&&-.214>r?o:i>=.166&&.234>i&&r>=-.214&&-.115>r?a:u).invert(n)},n.stream=function(n){var t=u.stream(n),e=o.stream(n),r=a.stream(n);return{point:function(n,i){t.point(n,i),e.point(n,i),r.point(n,i)},sphere:function(){t.sphere(),e.sphere(),r.sphere()},lineStart:function(){t.lineStart(),e.lineStart(),r.lineStart()},lineEnd:function(){t.lineEnd(),e.lineEnd(),r.lineEnd()},polygonStart:function(){t.polygonStart(),e.polygonStart(),r.polygonStart()},polygonEnd:function(){t.polygonEnd(),e.polygonEnd(),r.polygonEnd()}}},n.precision=function(t){return arguments.length?(u.precision(t),o.precision(t),a.precision(t),n):u.precision()},n.scale=function(t){return arguments.length?(u.scale(t),o.scale(.35*t),a.scale(t),n.translate(u.translate())):u.scale()},n.translate=function(t){if(!arguments.length)return u.translate();var c=u.scale(),f=+t[0],s=+t[1];return e=u.translate(t).clipExtent([[f-.455*c,s-.238*c],[f+.455*c,s+.238*c]]).stream(l).point,r=o.translate([f-.307*c,s+.201*c]).clipExtent([[f-.425*c+Uo,s+.12*c+Uo],[f-.214*c-Uo,s+.234*c-Uo]]).stream(l).point,i=a.translate([f-.205*c,s+.212*c]).clipExtent([[f-.214*c+Uo,s+.166*c+Uo],[f-.115*c-Uo,s+.234*c-Uo]]).stream(l).point,n},n.scale(1070)};var Oa,Ia,Ya,Za,Va,Xa,$a={point:b,lineStart:b,lineEnd:b,polygonStart:function(){Ia=0,$a.lineStart=$t},polygonEnd:function(){$a.lineStart=$a.lineEnd=$a.point=b,Oa+=xo(Ia/2)}},Ba={point:Bt,lineStart:b,lineEnd:b,polygonStart:b,polygonEnd:b},Wa={point:Gt,lineStart:Kt,lineEnd:Qt,polygonStart:function(){Wa.lineStart=ne},polygonEnd:function(){Wa.point=Gt,Wa.lineStart=Kt,Wa.lineEnd=Qt}};ao.geo.path=function(){function n(n){return n&&("function"==typeof a&&u.pointRadius(+a.apply(this,arguments)),o&&o.valid||(o=i(u)),ao.geo.stream(n,o)),u.result()}function t(){return o=null,n}var e,r,i,u,o,a=4.5;return n.area=function(n){return Oa=0,ao.geo.stream(n,i($a)),Oa},n.centroid=function(n){return Ca=za=La=qa=Ta=Ra=Da=Pa=Ua=0,ao.geo.stream(n,i(Wa)),Ua?[Da/Ua,Pa/Ua]:Ra?[qa/Ra,Ta/Ra]:La?[Ca/La,za/La]:[NaN,NaN]},n.bounds=function(n){return Va=Xa=-(Ya=Za=1/0),ao.geo.stream(n,i(Ba)),[[Ya,Za],[Va,Xa]]},n.projection=function(n){return arguments.length?(i=(e=n)?n.stream||re(n):m,t()):e},n.context=function(n){return arguments.length?(u=null==(r=n)?new Wt:new te(n),"function"!=typeof a&&u.pointRadius(a),t()):r},n.pointRadius=function(t){return arguments.length?(a="function"==typeof t?t:(u.pointRadius(+t),+t),n):a},n.projection(ao.geo.albersUsa()).context(null)},ao.geo.transform=function(n){return{stream:function(t){var e=new ie(t);for(var r in n)e[r]=n[r];return e}}},ie.prototype={point:function(n,t){this.stream.point(n,t)},sphere:function(){this.stream.sphere()},lineStart:function(){this.stream.lineStart()},lineEnd:function(){this.stream.lineEnd()},polygonStart:function(){this.stream.polygonStart()},polygonEnd:function(){this.stream.polygonEnd()}},ao.geo.projection=oe,ao.geo.projectionMutator=ae,(ao.geo.equirectangular=function(){return oe(ce)}).raw=ce.invert=ce,ao.geo.rotation=function(n){function t(t){return t=n(t[0]*Yo,t[1]*Yo),t[0]*=Zo,t[1]*=Zo,t}return n=se(n[0]%360*Yo,n[1]*Yo,n.length>2?n[2]*Yo:0),t.invert=function(t){return t=n.invert(t[0]*Yo,t[1]*Yo),t[0]*=Zo,t[1]*=Zo,t},t},fe.invert=ce,ao.geo.circle=function(){function n(){var n="function"==typeof r?r.apply(this,arguments):r,t=se(-n[0]*Yo,-n[1]*Yo,0).invert,i=[];return e(null,null,1,{point:function(n,e){i.push(n=t(n,e)),n[0]*=Zo,n[1]*=Zo}}),{type:"Polygon",coordinates:[i]}}var t,e,r=[0,0],i=6;return n.origin=function(t){return arguments.length?(r=t,n):r},n.angle=function(r){return arguments.length?(e=ve((t=+r)*Yo,i*Yo),n):t},n.precision=function(r){return arguments.length?(e=ve(t*Yo,(i=+r)*Yo),n):i},n.angle(90)},ao.geo.distance=function(n,t){var e,r=(t[0]-n[0])*Yo,i=n[1]*Yo,u=t[1]*Yo,o=Math.sin(r),a=Math.cos(r),l=Math.sin(i),c=Math.cos(i),f=Math.sin(u),s=Math.cos(u);return Math.atan2(Math.sqrt((e=s*o)*e+(e=c*f-l*s*a)*e),l*f+c*s*a)},ao.geo.graticule=function(){function n(){return{type:"MultiLineString",coordinates:t()}}function t(){return ao.range(Math.ceil(u/d)*d,i,d).map(h).concat(ao.range(Math.ceil(c/y)*y,l,y).map(p)).concat(ao.range(Math.ceil(r/g)*g,e,g).filter(function(n){return xo(n%d)>Uo}).map(f)).concat(ao.range(Math.ceil(a/v)*v,o,v).filter(function(n){return xo(n%y)>Uo}).map(s))}var e,r,i,u,o,a,l,c,f,s,h,p,g=10,v=g,d=90,y=360,m=2.5;return n.lines=function(){return t().map(function(n){return{type:"LineString",coordinates:n}})},n.outline=function(){return{type:"Polygon",coordinates:[h(u).concat(p(l).slice(1),h(i).reverse().slice(1),p(c).reverse().slice(1))]}},n.extent=function(t){return arguments.length?n.majorExtent(t).minorExtent(t):n.minorExtent()},n.majorExtent=function(t){return arguments.length?(u=+t[0][0],i=+t[1][0],c=+t[0][1],l=+t[1][1],u>i&&(t=u,u=i,i=t),c>l&&(t=c,c=l,l=t),n.precision(m)):[[u,c],[i,l]]},n.minorExtent=function(t){return arguments.length?(r=+t[0][0],e=+t[1][0],a=+t[0][1],o=+t[1][1],r>e&&(t=r,r=e,e=t),a>o&&(t=a,a=o,o=t),n.precision(m)):[[r,a],[e,o]]},n.step=function(t){return arguments.length?n.majorStep(t).minorStep(t):n.minorStep()},n.majorStep=function(t){return arguments.length?(d=+t[0],y=+t[1],n):[d,y]},n.minorStep=function(t){return arguments.length?(g=+t[0],v=+t[1],n):[g,v]},n.precision=function(t){return arguments.length?(m=+t,f=ye(a,o,90),s=me(r,e,m),h=ye(c,l,90),p=me(u,i,m),n):m},n.majorExtent([[-180,-90+Uo],[180,90-Uo]]).minorExtent([[-180,-80-Uo],[180,80+Uo]])},ao.geo.greatArc=function(){function n(){return{type:"LineString",coordinates:[t||r.apply(this,arguments),e||i.apply(this,arguments)]}}var t,e,r=Me,i=xe;return n.distance=function(){return ao.geo.distance(t||r.apply(this,arguments),e||i.apply(this,arguments))},n.source=function(e){return arguments.length?(r=e,t="function"==typeof e?null:e,n):r},n.target=function(t){return arguments.length?(i=t,e="function"==typeof t?null:t,n):i},n.precision=function(){return arguments.length?n:0},n},ao.geo.interpolate=function(n,t){return be(n[0]*Yo,n[1]*Yo,t[0]*Yo,t[1]*Yo)},ao.geo.length=function(n){return Ja=0,ao.geo.stream(n,Ga),Ja};var Ja,Ga={sphere:b,point:b,lineStart:_e,lineEnd:b,polygonStart:b,polygonEnd:b},Ka=we(function(n){return Math.sqrt(2/(1+n))},function(n){return 2*Math.asin(n/2)});(ao.geo.azimuthalEqualArea=function(){return oe(Ka)}).raw=Ka;var Qa=we(function(n){var t=Math.acos(n);return t&&t/Math.sin(t)},m);(ao.geo.azimuthalEquidistant=function(){return oe(Qa)}).raw=Qa,(ao.geo.conicConformal=function(){return Vt(Se)}).raw=Se,(ao.geo.conicEquidistant=function(){return Vt(ke)}).raw=ke;var nl=we(function(n){return 1/n},Math.atan);(ao.geo.gnomonic=function(){return oe(nl)}).raw=nl,Ne.invert=function(n,t){return[n,2*Math.atan(Math.exp(t))-Io]},(ao.geo.mercator=function(){return Ee(Ne)}).raw=Ne;var tl=we(function(){return 1},Math.asin);(ao.geo.orthographic=function(){return oe(tl)}).raw=tl;var el=we(function(n){return 1/(1+n)},function(n){return 2*Math.atan(n)});(ao.geo.stereographic=function(){return oe(el)}).raw=el,Ae.invert=function(n,t){return[-t,2*Math.atan(Math.exp(n))-Io]},(ao.geo.transverseMercator=function(){var n=Ee(Ae),t=n.center,e=n.rotate;return n.center=function(n){return n?t([-n[1],n[0]]):(n=t(),[n[1],-n[0]])},n.rotate=function(n){return n?e([n[0],n[1],n.length>2?n[2]+90:90]):(n=e(),[n[0],n[1],n[2]-90])},e([0,0,90])}).raw=Ae,ao.geom={},ao.geom.hull=function(n){function t(n){if(n.length<3)return[];var t,i=En(e),u=En(r),o=n.length,a=[],l=[];for(t=0;o>t;t++)a.push([+i.call(this,n[t],t),+u.call(this,n[t],t),t]);for(a.sort(qe),t=0;o>t;t++)l.push([a[t][0],-a[t][1]]);var c=Le(a),f=Le(l),s=f[0]===c[0],h=f[f.length-1]===c[c.length-1],p=[];for(t=c.length-1;t>=0;--t)p.push(n[a[c[t]][2]]);for(t=+s;t<f.length-h;++t)p.push(n[a[f[t]][2]]);return p}var e=Ce,r=ze;return arguments.length?t(n):(t.x=function(n){return arguments.length?(e=n,t):e},t.y=function(n){return arguments.length?(r=n,t):r},t)},ao.geom.polygon=function(n){return ko(n,rl),n};var rl=ao.geom.polygon.prototype=[];rl.area=function(){for(var n,t=-1,e=this.length,r=this[e-1],i=0;++t<e;)n=r,r=this[t],i+=n[1]*r[0]-n[0]*r[1];return.5*i},rl.centroid=function(n){var t,e,r=-1,i=this.length,u=0,o=0,a=this[i-1];for(arguments.length||(n=-1/(6*this.area()));++r<i;)t=a,a=this[r],e=t[0]*a[1]-a[0]*t[1],u+=(t[0]+a[0])*e,o+=(t[1]+a[1])*e;return[u*n,o*n]},rl.clip=function(n){for(var t,e,r,i,u,o,a=De(n),l=-1,c=this.length-De(this),f=this[c-1];++l<c;){for(t=n.slice(),n.length=0,i=this[l],u=t[(r=t.length-a)-1],e=-1;++e<r;)o=t[e],Te(o,f,i)?(Te(u,f,i)||n.push(Re(u,o,f,i)),n.push(o)):Te(u,f,i)&&n.push(Re(u,o,f,i)),u=o;a&&n.push(n[0]),f=i}return n};var il,ul,ol,al,ll,cl=[],fl=[];Ye.prototype.prepare=function(){for(var n,t=this.edges,e=t.length;e--;)n=t[e].edge,n.b&&n.a||t.splice(e,1);return t.sort(Ve),t.length},tr.prototype={start:function(){return this.edge.l===this.site?this.edge.a:this.edge.b},end:function(){return this.edge.l===this.site?this.edge.b:this.edge.a}},er.prototype={insert:function(n,t){var e,r,i;if(n){if(t.P=n,t.N=n.N,n.N&&(n.N.P=t),n.N=t,n.R){for(n=n.R;n.L;)n=n.L;n.L=t}else n.R=t;e=n}else this._?(n=or(this._),t.P=null,t.N=n,n.P=n.L=t,e=n):(t.P=t.N=null,this._=t,e=null);for(t.L=t.R=null,t.U=e,t.C=!0,n=t;e&&e.C;)r=e.U,e===r.L?(i=r.R,i&&i.C?(e.C=i.C=!1,r.C=!0,n=r):(n===e.R&&(ir(this,e),n=e,e=n.U),e.C=!1,r.C=!0,ur(this,r))):(i=r.L,i&&i.C?(e.C=i.C=!1,r.C=!0,n=r):(n===e.L&&(ur(this,e),n=e,e=n.U),e.C=!1,r.C=!0,ir(this,r))),e=n.U;this._.C=!1},remove:function(n){n.N&&(n.N.P=n.P),n.P&&(n.P.N=n.N),n.N=n.P=null;var t,e,r,i=n.U,u=n.L,o=n.R;if(e=u?o?or(o):u:o,i?i.L===n?i.L=e:i.R=e:this._=e,u&&o?(r=e.C,e.C=n.C,e.L=u,u.U=e,e!==o?(i=e.U,e.U=n.U,n=e.R,i.L=n,e.R=o,o.U=e):(e.U=i,i=e,n=e.R)):(r=n.C,n=e),n&&(n.U=i),!r){if(n&&n.C)return void(n.C=!1);do{if(n===this._)break;if(n===i.L){if(t=i.R,t.C&&(t.C=!1,i.C=!0,ir(this,i),t=i.R),t.L&&t.L.C||t.R&&t.R.C){t.R&&t.R.C||(t.L.C=!1,t.C=!0,ur(this,t),t=i.R),t.C=i.C,i.C=t.R.C=!1,ir(this,i),n=this._;break}}else if(t=i.L,t.C&&(t.C=!1,i.C=!0,ur(this,i),t=i.L),t.L&&t.L.C||t.R&&t.R.C){t.L&&t.L.C||(t.R.C=!1,t.C=!0,ir(this,t),t=i.L),t.C=i.C,i.C=t.L.C=!1,ur(this,i),n=this._;break}t.C=!0,n=i,i=i.U}while(!n.C);n&&(n.C=!1)}}},ao.geom.voronoi=function(n){function t(n){var t=new Array(n.length),r=a[0][0],i=a[0][1],u=a[1][0],o=a[1][1];return ar(e(n),a).cells.forEach(function(e,a){var l=e.edges,c=e.site,f=t[a]=l.length?l.map(function(n){var t=n.start();return[t.x,t.y]}):c.x>=r&&c.x<=u&&c.y>=i&&c.y<=o?[[r,o],[u,o],[u,i],[r,i]]:[];f.point=n[a]}),t}function e(n){return n.map(function(n,t){return{x:Math.round(u(n,t)/Uo)*Uo,y:Math.round(o(n,t)/Uo)*Uo,i:t}})}var r=Ce,i=ze,u=r,o=i,a=sl;return n?t(n):(t.links=function(n){return ar(e(n)).edges.filter(function(n){return n.l&&n.r}).map(function(t){return{source:n[t.l.i],target:n[t.r.i]}})},t.triangles=function(n){var t=[];return ar(e(n)).cells.forEach(function(e,r){for(var i,u,o=e.site,a=e.edges.sort(Ve),l=-1,c=a.length,f=a[c-1].edge,s=f.l===o?f.r:f.l;++l<c;)i=f,u=s,f=a[l].edge,s=f.l===o?f.r:f.l,r<u.i&&r<s.i&&cr(o,u,s)<0&&t.push([n[r],n[u.i],n[s.i]])}),t},t.x=function(n){return arguments.length?(u=En(r=n),t):r},t.y=function(n){return arguments.length?(o=En(i=n),t):i},t.clipExtent=function(n){return arguments.length?(a=null==n?sl:n,t):a===sl?null:a},t.size=function(n){return arguments.length?t.clipExtent(n&&[[0,0],n]):a===sl?null:a&&a[1]},t)};var sl=[[-1e6,-1e6],[1e6,1e6]];ao.geom.delaunay=function(n){return ao.geom.voronoi().triangles(n)},ao.geom.quadtree=function(n,t,e,r,i){function u(n){function u(n,t,e,r,i,u,o,a){if(!isNaN(e)&&!isNaN(r))if(n.leaf){var l=n.x,f=n.y;if(null!=l)if(xo(l-e)+xo(f-r)<.01)c(n,t,e,r,i,u,o,a);else{var s=n.point;n.x=n.y=n.point=null,c(n,s,l,f,i,u,o,a),c(n,t,e,r,i,u,o,a)}else n.x=e,n.y=r,n.point=t}else c(n,t,e,r,i,u,o,a)}function c(n,t,e,r,i,o,a,l){var c=.5*(i+a),f=.5*(o+l),s=e>=c,h=r>=f,p=h<<1|s;n.leaf=!1,n=n.nodes[p]||(n.nodes[p]=hr()),s?i=c:a=c,h?o=f:l=f,u(n,t,e,r,i,o,a,l)}var f,s,h,p,g,v,d,y,m,M=En(a),x=En(l);if(null!=t)v=t,d=e,y=r,m=i;else if(y=m=-(v=d=1/0),s=[],h=[],g=n.length,o)for(p=0;g>p;++p)f=n[p],f.x<v&&(v=f.x),f.y<d&&(d=f.y),f.x>y&&(y=f.x),f.y>m&&(m=f.y),s.push(f.x),h.push(f.y);else for(p=0;g>p;++p){var b=+M(f=n[p],p),_=+x(f,p);v>b&&(v=b),d>_&&(d=_),b>y&&(y=b),_>m&&(m=_),s.push(b),h.push(_)}var w=y-v,S=m-d;w>S?m=d+w:y=v+S;var k=hr();if(k.add=function(n){u(k,n,+M(n,++p),+x(n,p),v,d,y,m)},k.visit=function(n){pr(n,k,v,d,y,m)},k.find=function(n){return gr(k,n[0],n[1],v,d,y,m)},p=-1,null==t){for(;++p<g;)u(k,n[p],s[p],h[p],v,d,y,m);--p}else n.forEach(k.add);return s=h=n=f=null,k}var o,a=Ce,l=ze;return(o=arguments.length)?(a=fr,l=sr,3===o&&(i=e,r=t,e=t=0),u(n)):(u.x=function(n){return arguments.length?(a=n,u):a},u.y=function(n){return arguments.length?(l=n,u):l},u.extent=function(n){return arguments.length?(null==n?t=e=r=i=null:(t=+n[0][0],e=+n[0][1],r=+n[1][0],i=+n[1][1]),u):null==t?null:[[t,e],[r,i]]},u.size=function(n){return arguments.length?(null==n?t=e=r=i=null:(t=e=0,r=+n[0],i=+n[1]),u):null==t?null:[r-t,i-e]},u)},ao.interpolateRgb=vr,ao.interpolateObject=dr,ao.interpolateNumber=yr,ao.interpolateString=mr;var hl=/[-+]?(?:\d+\.?\d*|\.?\d+)(?:[eE][-+]?\d+)?/g,pl=new RegExp(hl.source,"g");ao.interpolate=Mr,ao.interpolators=[function(n,t){var e=typeof t;return("string"===e?ua.has(t.toLowerCase())||/^(#|rgb\(|hsl\()/i.test(t)?vr:mr:t instanceof an?vr:Array.isArray(t)?xr:"object"===e&&isNaN(t)?dr:yr)(n,t)}],ao.interpolateArray=xr;var gl=function(){return m},vl=ao.map({linear:gl,poly:Er,quad:function(){return Sr},cubic:function(){return kr},sin:function(){return Ar},exp:function(){return Cr},circle:function(){return zr},elastic:Lr,back:qr,bounce:function(){return Tr}}),dl=ao.map({"in":m,out:_r,"in-out":wr,"out-in":function(n){return wr(_r(n))}});ao.ease=function(n){var t=n.indexOf("-"),e=t>=0?n.slice(0,t):n,r=t>=0?n.slice(t+1):"in";return e=vl.get(e)||gl,r=dl.get(r)||m,br(r(e.apply(null,lo.call(arguments,1))))},ao.interpolateHcl=Rr,ao.interpolateHsl=Dr,ao.interpolateLab=Pr,ao.interpolateRound=Ur,ao.transform=function(n){var t=fo.createElementNS(ao.ns.prefix.svg,"g");return(ao.transform=function(n){if(null!=n){t.setAttribute("transform",n);var e=t.transform.baseVal.consolidate()}return new jr(e?e.matrix:yl)})(n)},jr.prototype.toString=function(){return"translate("+this.translate+")rotate("+this.rotate+")skewX("+this.skew+")scale("+this.scale+")"};var yl={a:1,b:0,c:0,d:1,e:0,f:0};ao.interpolateTransform=$r,ao.layout={},ao.layout.bundle=function(){return function(n){for(var t=[],e=-1,r=n.length;++e<r;)t.push(Jr(n[e]));return t}},ao.layout.chord=function(){function n(){var n,c,s,h,p,g={},v=[],d=ao.range(u),y=[];for(e=[],r=[],n=0,h=-1;++h<u;){for(c=0,p=-1;++p<u;)c+=i[h][p];v.push(c),y.push(ao.range(u)),n+=c}for(o&&d.sort(function(n,t){return o(v[n],v[t])}),a&&y.forEach(function(n,t){n.sort(function(n,e){return a(i[t][n],i[t][e])})}),n=(Ho-f*u)/n,c=0,h=-1;++h<u;){for(s=c,p=-1;++p<u;){var m=d[h],M=y[m][p],x=i[m][M],b=c,_=c+=x*n;g[m+"-"+M]={index:m,subindex:M,startAngle:b,endAngle:_,value:x}}r[m]={index:m,startAngle:s,endAngle:c,value:v[m]},c+=f}for(h=-1;++h<u;)for(p=h-1;++p<u;){var w=g[h+"-"+p],S=g[p+"-"+h];(w.value||S.value)&&e.push(w.value<S.value?{source:S,target:w}:{source:w,target:S})}l&&t()}function t(){e.sort(function(n,t){return l((n.source.value+n.target.value)/2,(t.source.value+t.target.value)/2)})}var e,r,i,u,o,a,l,c={},f=0;return c.matrix=function(n){return arguments.length?(u=(i=n)&&i.length,e=r=null,c):i},c.padding=function(n){return arguments.length?(f=n,e=r=null,c):f},c.sortGroups=function(n){return arguments.length?(o=n,e=r=null,c):o},c.sortSubgroups=function(n){return arguments.length?(a=n,e=null,c):a},c.sortChords=function(n){return arguments.length?(l=n,e&&t(),c):l},c.chords=function(){return e||n(),e},c.groups=function(){return r||n(),r},c},ao.layout.force=function(){function n(n){return function(t,e,r,i){if(t.point!==n){var u=t.cx-n.x,o=t.cy-n.y,a=i-e,l=u*u+o*o;if(l>a*a/y){if(v>l){var c=t.charge/l;n.px-=u*c,n.py-=o*c}return!0}if(t.point&&l&&v>l){var c=t.pointCharge/l;n.px-=u*c,n.py-=o*c}}return!t.charge}}function t(n){n.px=ao.event.x,n.py=ao.event.y,l.resume()}var e,r,i,u,o,a,l={},c=ao.dispatch("start","tick","end"),f=[1,1],s=.9,h=ml,p=Ml,g=-30,v=xl,d=.1,y=.64,M=[],x=[];return l.tick=function(){if((i*=.99)<.005)return e=null,c.end({type:"end",alpha:i=0}),!0;var t,r,l,h,p,v,y,m,b,_=M.length,w=x.length;for(r=0;w>r;++r)l=x[r],h=l.source,p=l.target,m=p.x-h.x,b=p.y-h.y,(v=m*m+b*b)&&(v=i*o[r]*((v=Math.sqrt(v))-u[r])/v,m*=v,b*=v,p.x-=m*(y=h.weight+p.weight?h.weight/(h.weight+p.weight):.5),p.y-=b*y,h.x+=m*(y=1-y),h.y+=b*y);if((y=i*d)&&(m=f[0]/2,b=f[1]/2,r=-1,y))for(;++r<_;)l=M[r],l.x+=(m-l.x)*y,l.y+=(b-l.y)*y;if(g)for(ri(t=ao.geom.quadtree(M),i,a),r=-1;++r<_;)(l=M[r]).fixed||t.visit(n(l));for(r=-1;++r<_;)l=M[r],l.fixed?(l.x=l.px,l.y=l.py):(l.x-=(l.px-(l.px=l.x))*s,l.y-=(l.py-(l.py=l.y))*s);c.tick({type:"tick",alpha:i})},l.nodes=function(n){return arguments.length?(M=n,l):M},l.links=function(n){return arguments.length?(x=n,l):x},l.size=function(n){return arguments.length?(f=n,l):f},l.linkDistance=function(n){return arguments.length?(h="function"==typeof n?n:+n,l):h},l.distance=l.linkDistance,l.linkStrength=function(n){return arguments.length?(p="function"==typeof n?n:+n,l):p},l.friction=function(n){return arguments.length?(s=+n,l):s},l.charge=function(n){return arguments.length?(g="function"==typeof n?n:+n,l):g},l.chargeDistance=function(n){return arguments.length?(v=n*n,l):Math.sqrt(v)},l.gravity=function(n){return arguments.length?(d=+n,l):d},l.theta=function(n){return arguments.length?(y=n*n,l):Math.sqrt(y)},l.alpha=function(n){return arguments.length?(n=+n,i?n>0?i=n:(e.c=null,e.t=NaN,e=null,c.end({type:"end",alpha:i=0})):n>0&&(c.start({type:"start",alpha:i=n}),e=qn(l.tick)),l):i},l.start=function(){function n(n,r){if(!e){for(e=new Array(i),l=0;i>l;++l)e[l]=[];for(l=0;c>l;++l){var u=x[l];e[u.source.index].push(u.target),e[u.target.index].push(u.source)}}for(var o,a=e[t],l=-1,f=a.length;++l<f;)if(!isNaN(o=a[l][n]))return o;return Math.random()*r}var t,e,r,i=M.length,c=x.length,s=f[0],v=f[1];for(t=0;i>t;++t)(r=M[t]).index=t,r.weight=0;for(t=0;c>t;++t)r=x[t],"number"==typeof r.source&&(r.source=M[r.source]),"number"==typeof r.target&&(r.target=M[r.target]),++r.source.weight,++r.target.weight;for(t=0;i>t;++t)r=M[t],isNaN(r.x)&&(r.x=n("x",s)),isNaN(r.y)&&(r.y=n("y",v)),isNaN(r.px)&&(r.px=r.x),isNaN(r.py)&&(r.py=r.y);if(u=[],"function"==typeof h)for(t=0;c>t;++t)u[t]=+h.call(this,x[t],t);else for(t=0;c>t;++t)u[t]=h;if(o=[],"function"==typeof p)for(t=0;c>t;++t)o[t]=+p.call(this,x[t],t);else for(t=0;c>t;++t)o[t]=p;if(a=[],"function"==typeof g)for(t=0;i>t;++t)a[t]=+g.call(this,M[t],t);else for(t=0;i>t;++t)a[t]=g;return l.resume()},l.resume=function(){return l.alpha(.1)},l.stop=function(){return l.alpha(0)},l.drag=function(){return r||(r=ao.behavior.drag().origin(m).on("dragstart.force",Qr).on("drag.force",t).on("dragend.force",ni)),arguments.length?void this.on("mouseover.force",ti).on("mouseout.force",ei).call(r):r},ao.rebind(l,c,"on")};var ml=20,Ml=1,xl=1/0;ao.layout.hierarchy=function(){function n(i){var u,o=[i],a=[];for(i.depth=0;null!=(u=o.pop());)if(a.push(u),(c=e.call(n,u,u.depth))&&(l=c.length)){for(var l,c,f;--l>=0;)o.push(f=c[l]),f.parent=u,f.depth=u.depth+1;r&&(u.value=0),u.children=c}else r&&(u.value=+r.call(n,u,u.depth)||0),delete u.children;return oi(i,function(n){var e,i;t&&(e=n.children)&&e.sort(t),r&&(i=n.parent)&&(i.value+=n.value)}),a}var t=ci,e=ai,r=li;return n.sort=function(e){return arguments.length?(t=e,n):t},n.children=function(t){return arguments.length?(e=t,n):e},n.value=function(t){return arguments.length?(r=t,n):r},n.revalue=function(t){return r&&(ui(t,function(n){n.children&&(n.value=0)}),oi(t,function(t){var e;t.children||(t.value=+r.call(n,t,t.depth)||0),(e=t.parent)&&(e.value+=t.value)})),t},n},ao.layout.partition=function(){function n(t,e,r,i){var u=t.children;if(t.x=e,t.y=t.depth*i,t.dx=r,t.dy=i,u&&(o=u.length)){var o,a,l,c=-1;for(r=t.value?r/t.value:0;++c<o;)n(a=u[c],e,l=a.value*r,i),e+=l}}function t(n){var e=n.children,r=0;if(e&&(i=e.length))for(var i,u=-1;++u<i;)r=Math.max(r,t(e[u]));return 1+r}function e(e,u){var o=r.call(this,e,u);return n(o[0],0,i[0],i[1]/t(o[0])),o}var r=ao.layout.hierarchy(),i=[1,1];return e.size=function(n){return arguments.length?(i=n,e):i},ii(e,r)},ao.layout.pie=function(){function n(o){var a,l=o.length,c=o.map(function(e,r){return+t.call(n,e,r)}),f=+("function"==typeof r?r.apply(this,arguments):r),s=("function"==typeof i?i.apply(this,arguments):i)-f,h=Math.min(Math.abs(s)/l,+("function"==typeof u?u.apply(this,arguments):u)),p=h*(0>s?-1:1),g=ao.sum(c),v=g?(s-l*p)/g:0,d=ao.range(l),y=[];return null!=e&&d.sort(e===bl?function(n,t){return c[t]-c[n]}:function(n,t){return e(o[n],o[t])}),d.forEach(function(n){y[n]={data:o[n],value:a=c[n],startAngle:f,endAngle:f+=a*v+p,padAngle:h}}),y}var t=Number,e=bl,r=0,i=Ho,u=0;return n.value=function(e){return arguments.length?(t=e,n):t},n.sort=function(t){return arguments.length?(e=t,n):e},n.startAngle=function(t){return arguments.length?(r=t,n):r},n.endAngle=function(t){return arguments.length?(i=t,n):i},n.padAngle=function(t){return arguments.length?(u=t,n):u},n};var bl={};ao.layout.stack=function(){function n(a,l){if(!(h=a.length))return a;var c=a.map(function(e,r){return t.call(n,e,r)}),f=c.map(function(t){return t.map(function(t,e){return[u.call(n,t,e),o.call(n,t,e)]})}),s=e.call(n,f,l);c=ao.permute(c,s),f=ao.permute(f,s);var h,p,g,v,d=r.call(n,f,l),y=c[0].length;for(g=0;y>g;++g)for(i.call(n,c[0][g],v=d[g],f[0][g][1]),p=1;h>p;++p)i.call(n,c[p][g],v+=f[p-1][g][1],f[p][g][1]);return a}var t=m,e=gi,r=vi,i=pi,u=si,o=hi;return n.values=function(e){return arguments.length?(t=e,n):t},n.order=function(t){return arguments.length?(e="function"==typeof t?t:_l.get(t)||gi,n):e},n.offset=function(t){return arguments.length?(r="function"==typeof t?t:wl.get(t)||vi,n):r},n.x=function(t){return arguments.length?(u=t,n):u},n.y=function(t){return arguments.length?(o=t,n):o},n.out=function(t){return arguments.length?(i=t,n):i},n};var _l=ao.map({"inside-out":function(n){var t,e,r=n.length,i=n.map(di),u=n.map(yi),o=ao.range(r).sort(function(n,t){return i[n]-i[t]}),a=0,l=0,c=[],f=[];for(t=0;r>t;++t)e=o[t],l>a?(a+=u[e],c.push(e)):(l+=u[e],f.push(e));return f.reverse().concat(c)},reverse:function(n){return ao.range(n.length).reverse()},"default":gi}),wl=ao.map({silhouette:function(n){var t,e,r,i=n.length,u=n[0].length,o=[],a=0,l=[];for(e=0;u>e;++e){for(t=0,r=0;i>t;t++)r+=n[t][e][1];r>a&&(a=r),o.push(r)}for(e=0;u>e;++e)l[e]=(a-o[e])/2;return l},wiggle:function(n){var t,e,r,i,u,o,a,l,c,f=n.length,s=n[0],h=s.length,p=[];for(p[0]=l=c=0,e=1;h>e;++e){for(t=0,i=0;f>t;++t)i+=n[t][e][1];for(t=0,u=0,a=s[e][0]-s[e-1][0];f>t;++t){for(r=0,o=(n[t][e][1]-n[t][e-1][1])/(2*a);t>r;++r)o+=(n[r][e][1]-n[r][e-1][1])/a;u+=o*n[t][e][1]}p[e]=l-=i?u/i*a:0,c>l&&(c=l)}for(e=0;h>e;++e)p[e]-=c;return p},expand:function(n){var t,e,r,i=n.length,u=n[0].length,o=1/i,a=[];for(e=0;u>e;++e){for(t=0,r=0;i>t;t++)r+=n[t][e][1];if(r)for(t=0;i>t;t++)n[t][e][1]/=r;else for(t=0;i>t;t++)n[t][e][1]=o}for(e=0;u>e;++e)a[e]=0;return a},zero:vi});ao.layout.histogram=function(){function n(n,u){for(var o,a,l=[],c=n.map(e,this),f=r.call(this,c,u),s=i.call(this,f,c,u),u=-1,h=c.length,p=s.length-1,g=t?1:1/h;++u<p;)o=l[u]=[],o.dx=s[u+1]-(o.x=s[u]),o.y=0;if(p>0)for(u=-1;++u<h;)a=c[u],a>=f[0]&&a<=f[1]&&(o=l[ao.bisect(s,a,1,p)-1],o.y+=g,o.push(n[u]));return l}var t=!0,e=Number,r=bi,i=Mi;return n.value=function(t){return arguments.length?(e=t,n):e},n.range=function(t){return arguments.length?(r=En(t),n):r},n.bins=function(t){return arguments.length?(i="number"==typeof t?function(n){return xi(n,t)}:En(t),n):i},n.frequency=function(e){return arguments.length?(t=!!e,n):t},n},ao.layout.pack=function(){function n(n,u){var o=e.call(this,n,u),a=o[0],l=i[0],c=i[1],f=null==t?Math.sqrt:"function"==typeof t?t:function(){return t};if(a.x=a.y=0,oi(a,function(n){n.r=+f(n.value)}),oi(a,Ni),r){var s=r*(t?1:Math.max(2*a.r/l,2*a.r/c))/2;oi(a,function(n){n.r+=s}),oi(a,Ni),oi(a,function(n){n.r-=s})}return Ci(a,l/2,c/2,t?1:1/Math.max(2*a.r/l,2*a.r/c)),o}var t,e=ao.layout.hierarchy().sort(_i),r=0,i=[1,1];return n.size=function(t){return arguments.length?(i=t,n):i},n.radius=function(e){return arguments.length?(t=null==e||"function"==typeof e?e:+e,n):t},n.padding=function(t){return arguments.length?(r=+t,n):r},ii(n,e)},ao.layout.tree=function(){function n(n,i){var f=o.call(this,n,i),s=f[0],h=t(s);if(oi(h,e),h.parent.m=-h.z,ui(h,r),c)ui(s,u);else{var p=s,g=s,v=s;ui(s,function(n){n.x<p.x&&(p=n),n.x>g.x&&(g=n),n.depth>v.depth&&(v=n)});var d=a(p,g)/2-p.x,y=l[0]/(g.x+a(g,p)/2+d),m=l[1]/(v.depth||1);ui(s,function(n){n.x=(n.x+d)*y,n.y=n.depth*m})}return f}function t(n){for(var t,e={A:null,children:[n]},r=[e];null!=(t=r.pop());)for(var i,u=t.children,o=0,a=u.length;a>o;++o)r.push((u[o]=i={_:u[o],parent:t,children:(i=u[o].children)&&i.slice()||[],A:null,a:null,z:0,m:0,c:0,s:0,t:null,i:o}).a=i);return e.children[0]}function e(n){var t=n.children,e=n.parent.children,r=n.i?e[n.i-1]:null;if(t.length){Di(n);var u=(t[0].z+t[t.length-1].z)/2;r?(n.z=r.z+a(n._,r._),n.m=n.z-u):n.z=u}else r&&(n.z=r.z+a(n._,r._));n.parent.A=i(n,r,n.parent.A||e[0])}function r(n){n._.x=n.z+n.parent.m,n.m+=n.parent.m}function i(n,t,e){if(t){for(var r,i=n,u=n,o=t,l=i.parent.children[0],c=i.m,f=u.m,s=o.m,h=l.m;o=Ti(o),i=qi(i),o&&i;)l=qi(l),u=Ti(u),u.a=n,r=o.z+s-i.z-c+a(o._,i._),r>0&&(Ri(Pi(o,n,e),n,r),c+=r,f+=r),s+=o.m,c+=i.m,h+=l.m,f+=u.m;o&&!Ti(u)&&(u.t=o,u.m+=s-f),i&&!qi(l)&&(l.t=i,l.m+=c-h,e=n)}return e}function u(n){n.x*=l[0],n.y=n.depth*l[1]}var o=ao.layout.hierarchy().sort(null).value(null),a=Li,l=[1,1],c=null;return n.separation=function(t){return arguments.length?(a=t,n):a},n.size=function(t){return arguments.length?(c=null==(l=t)?u:null,n):c?null:l},n.nodeSize=function(t){return arguments.length?(c=null==(l=t)?null:u,n):c?l:null},ii(n,o)},ao.layout.cluster=function(){function n(n,u){var o,a=t.call(this,n,u),l=a[0],c=0;oi(l,function(n){var t=n.children;t&&t.length?(n.x=ji(t),n.y=Ui(t)):(n.x=o?c+=e(n,o):0,n.y=0,o=n)});var f=Fi(l),s=Hi(l),h=f.x-e(f,s)/2,p=s.x+e(s,f)/2;return oi(l,i?function(n){n.x=(n.x-l.x)*r[0],n.y=(l.y-n.y)*r[1]}:function(n){n.x=(n.x-h)/(p-h)*r[0],n.y=(1-(l.y?n.y/l.y:1))*r[1]}),a}var t=ao.layout.hierarchy().sort(null).value(null),e=Li,r=[1,1],i=!1;return n.separation=function(t){return arguments.length?(e=t,n):e},n.size=function(t){return arguments.length?(i=null==(r=t),n):i?null:r},n.nodeSize=function(t){return arguments.length?(i=null!=(r=t),n):i?r:null},ii(n,t)},ao.layout.treemap=function(){function n(n,t){for(var e,r,i=-1,u=n.length;++i<u;)r=(e=n[i]).value*(0>t?0:t),e.area=isNaN(r)||0>=r?0:r}function t(e){var u=e.children;if(u&&u.length){var o,a,l,c=s(e),f=[],h=u.slice(),g=1/0,v="slice"===p?c.dx:"dice"===p?c.dy:"slice-dice"===p?1&e.depth?c.dy:c.dx:Math.min(c.dx,c.dy);for(n(h,c.dx*c.dy/e.value),f.area=0;(l=h.length)>0;)f.push(o=h[l-1]),f.area+=o.area,"squarify"!==p||(a=r(f,v))<=g?(h.pop(),g=a):(f.area-=f.pop().area,i(f,v,c,!1),v=Math.min(c.dx,c.dy),f.length=f.area=0,g=1/0);f.length&&(i(f,v,c,!0),f.length=f.area=0),u.forEach(t)}}function e(t){var r=t.children;if(r&&r.length){var u,o=s(t),a=r.slice(),l=[];for(n(a,o.dx*o.dy/t.value),l.area=0;u=a.pop();)l.push(u),l.area+=u.area,null!=u.z&&(i(l,u.z?o.dx:o.dy,o,!a.length),l.length=l.area=0);r.forEach(e)}}function r(n,t){for(var e,r=n.area,i=0,u=1/0,o=-1,a=n.length;++o<a;)(e=n[o].area)&&(u>e&&(u=e),e>i&&(i=e));return r*=r,t*=t,r?Math.max(t*i*g/r,r/(t*u*g)):1/0}function i(n,t,e,r){var i,u=-1,o=n.length,a=e.x,c=e.y,f=t?l(n.area/t):0;if(t==e.dx){for((r||f>e.dy)&&(f=e.dy);++u<o;)i=n[u],i.x=a,i.y=c,i.dy=f,a+=i.dx=Math.min(e.x+e.dx-a,f?l(i.area/f):0);i.z=!0,i.dx+=e.x+e.dx-a,e.y+=f,e.dy-=f}else{for((r||f>e.dx)&&(f=e.dx);++u<o;)i=n[u],i.x=a,i.y=c,i.dx=f,c+=i.dy=Math.min(e.y+e.dy-c,f?l(i.area/f):0);i.z=!1,i.dy+=e.y+e.dy-c,e.x+=f,e.dx-=f}}function u(r){var i=o||a(r),u=i[0];return u.x=u.y=0,u.value?(u.dx=c[0],u.dy=c[1]):u.dx=u.dy=0,o&&a.revalue(u),n([u],u.dx*u.dy/u.value),(o?e:t)(u),h&&(o=i),i}var o,a=ao.layout.hierarchy(),l=Math.round,c=[1,1],f=null,s=Oi,h=!1,p="squarify",g=.5*(1+Math.sqrt(5));return u.size=function(n){return arguments.length?(c=n,u):c},u.padding=function(n){function t(t){var e=n.call(u,t,t.depth);return null==e?Oi(t):Ii(t,"number"==typeof e?[e,e,e,e]:e)}function e(t){return Ii(t,n)}if(!arguments.length)return f;var r;return s=null==(f=n)?Oi:"function"==(r=typeof n)?t:"number"===r?(n=[n,n,n,n],e):e,u},u.round=function(n){return arguments.length?(l=n?Math.round:Number,u):l!=Number},u.sticky=function(n){return arguments.length?(h=n,o=null,u):h},u.ratio=function(n){return arguments.length?(g=n,u):g},u.mode=function(n){return arguments.length?(p=n+"",u):p},ii(u,a)},ao.random={normal:function(n,t){var e=arguments.length;return 2>e&&(t=1),1>e&&(n=0),function(){var e,r,i;do e=2*Math.random()-1,r=2*Math.random()-1,i=e*e+r*r;while(!i||i>1);return n+t*e*Math.sqrt(-2*Math.log(i)/i)}},logNormal:function(){var n=ao.random.normal.apply(ao,arguments);return function(){return Math.exp(n())}},bates:function(n){var t=ao.random.irwinHall(n);return function(){return t()/n}},irwinHall:function(n){return function(){for(var t=0,e=0;n>e;e++)t+=Math.random();return t}}},ao.scale={};var Sl={floor:m,ceil:m};ao.scale.linear=function(){return Wi([0,1],[0,1],Mr,!1)};var kl={s:1,g:1,p:1,r:1,e:1};ao.scale.log=function(){return ru(ao.scale.linear().domain([0,1]),10,!0,[1,10])};var Nl=ao.format(".0e"),El={floor:function(n){return-Math.ceil(-n)},ceil:function(n){return-Math.floor(-n)}};ao.scale.pow=function(){return iu(ao.scale.linear(),1,[0,1])},ao.scale.sqrt=function(){return ao.scale.pow().exponent(.5)},ao.scale.ordinal=function(){return ou([],{t:"range",a:[[]]})},ao.scale.category10=function(){return ao.scale.ordinal().range(Al)},ao.scale.category20=function(){return ao.scale.ordinal().range(Cl)},ao.scale.category20b=function(){return ao.scale.ordinal().range(zl)},ao.scale.category20c=function(){return ao.scale.ordinal().range(Ll)};var Al=[2062260,16744206,2924588,14034728,9725885,9197131,14907330,8355711,12369186,1556175].map(xn),Cl=[2062260,11454440,16744206,16759672,2924588,10018698,14034728,16750742,9725885,12955861,9197131,12885140,14907330,16234194,8355711,13092807,12369186,14408589,1556175,10410725].map(xn),zl=[3750777,5395619,7040719,10264286,6519097,9216594,11915115,13556636,9202993,12426809,15186514,15190932,8666169,11356490,14049643,15177372,8077683,10834324,13528509,14589654].map(xn),Ll=[3244733,7057110,10406625,13032431,15095053,16616764,16625259,16634018,3253076,7652470,10607003,13101504,7695281,10394312,12369372,14342891,6513507,9868950,12434877,14277081].map(xn);ao.scale.quantile=function(){return au([],[])},ao.scale.quantize=function(){return lu(0,1,[0,1])},ao.scale.threshold=function(){return cu([.5],[0,1])},ao.scale.identity=function(){return fu([0,1])},ao.svg={},ao.svg.arc=function(){function n(){var n=Math.max(0,+e.apply(this,arguments)),c=Math.max(0,+r.apply(this,arguments)),f=o.apply(this,arguments)-Io,s=a.apply(this,arguments)-Io,h=Math.abs(s-f),p=f>s?0:1;if(n>c&&(g=c,c=n,n=g),h>=Oo)return t(c,p)+(n?t(n,1-p):"")+"Z";var g,v,d,y,m,M,x,b,_,w,S,k,N=0,E=0,A=[];if((y=(+l.apply(this,arguments)||0)/2)&&(d=u===ql?Math.sqrt(n*n+c*c):+u.apply(this,arguments),p||(E*=-1),c&&(E=tn(d/c*Math.sin(y))),n&&(N=tn(d/n*Math.sin(y)))),c){m=c*Math.cos(f+E),M=c*Math.sin(f+E),x=c*Math.cos(s-E),b=c*Math.sin(s-E);var C=Math.abs(s-f-2*E)<=Fo?0:1;if(E&&yu(m,M,x,b)===p^C){var z=(f+s)/2;m=c*Math.cos(z),M=c*Math.sin(z),x=b=null}}else m=M=0;if(n){_=n*Math.cos(s-N),w=n*Math.sin(s-N),S=n*Math.cos(f+N),k=n*Math.sin(f+N);var L=Math.abs(f-s+2*N)<=Fo?0:1;if(N&&yu(_,w,S,k)===1-p^L){var q=(f+s)/2;_=n*Math.cos(q),w=n*Math.sin(q),S=k=null}}else _=w=0;if(h>Uo&&(g=Math.min(Math.abs(c-n)/2,+i.apply(this,arguments)))>.001){v=c>n^p?0:1;var T=g,R=g;if(Fo>h){var D=null==S?[_,w]:null==x?[m,M]:Re([m,M],[S,k],[x,b],[_,w]),P=m-D[0],U=M-D[1],j=x-D[0],F=b-D[1],H=1/Math.sin(Math.acos((P*j+U*F)/(Math.sqrt(P*P+U*U)*Math.sqrt(j*j+F*F)))/2),O=Math.sqrt(D[0]*D[0]+D[1]*D[1]);R=Math.min(g,(n-O)/(H-1)),T=Math.min(g,(c-O)/(H+1))}if(null!=x){var I=mu(null==S?[_,w]:[S,k],[m,M],c,T,p),Y=mu([x,b],[_,w],c,T,p);g===T?A.push("M",I[0],"A",T,",",T," 0 0,",v," ",I[1],"A",c,",",c," 0 ",1-p^yu(I[1][0],I[1][1],Y[1][0],Y[1][1]),",",p," ",Y[1],"A",T,",",T," 0 0,",v," ",Y[0]):A.push("M",I[0],"A",T,",",T," 0 1,",v," ",Y[0])}else A.push("M",m,",",M);if(null!=S){var Z=mu([m,M],[S,k],n,-R,p),V=mu([_,w],null==x?[m,M]:[x,b],n,-R,p);g===R?A.push("L",V[0],"A",R,",",R," 0 0,",v," ",V[1],"A",n,",",n," 0 ",p^yu(V[1][0],V[1][1],Z[1][0],Z[1][1]),",",1-p," ",Z[1],"A",R,",",R," 0 0,",v," ",Z[0]):A.push("L",V[0],"A",R,",",R," 0 0,",v," ",Z[0])}else A.push("L",_,",",w)}else A.push("M",m,",",M),null!=x&&A.push("A",c,",",c," 0 ",C,",",p," ",x,",",b),A.push("L",_,",",w),null!=S&&A.push("A",n,",",n," 0 ",L,",",1-p," ",S,",",k);return A.push("Z"),A.join("")}function t(n,t){return"M0,"+n+"A"+n+","+n+" 0 1,"+t+" 0,"+-n+"A"+n+","+n+" 0 1,"+t+" 0,"+n}var e=hu,r=pu,i=su,u=ql,o=gu,a=vu,l=du;return n.innerRadius=function(t){return arguments.length?(e=En(t),n):e},n.outerRadius=function(t){return arguments.length?(r=En(t),n):r},n.cornerRadius=function(t){return arguments.length?(i=En(t),n):i},n.padRadius=function(t){return arguments.length?(u=t==ql?ql:En(t),n):u},n.startAngle=function(t){return arguments.length?(o=En(t),n):o},n.endAngle=function(t){return arguments.length?(a=En(t),n):a},n.padAngle=function(t){return arguments.length?(l=En(t),n):l},n.centroid=function(){var n=(+e.apply(this,arguments)+ +r.apply(this,arguments))/2,t=(+o.apply(this,arguments)+ +a.apply(this,arguments))/2-Io;return[Math.cos(t)*n,Math.sin(t)*n]},n};var ql="auto";ao.svg.line=function(){return Mu(m)};var Tl=ao.map({linear:xu,"linear-closed":bu,step:_u,"step-before":wu,"step-after":Su,basis:zu,"basis-open":Lu,"basis-closed":qu,bundle:Tu,cardinal:Eu,"cardinal-open":ku,"cardinal-closed":Nu,monotone:Fu});Tl.forEach(function(n,t){t.key=n,t.closed=/-closed$/.test(n)});var Rl=[0,2/3,1/3,0],Dl=[0,1/3,2/3,0],Pl=[0,1/6,2/3,1/6];ao.svg.line.radial=function(){var n=Mu(Hu);return n.radius=n.x,delete n.x,n.angle=n.y,delete n.y,n},wu.reverse=Su,Su.reverse=wu,ao.svg.area=function(){return Ou(m)},ao.svg.area.radial=function(){var n=Ou(Hu);return n.radius=n.x,delete n.x,n.innerRadius=n.x0,delete n.x0,n.outerRadius=n.x1,delete n.x1,n.angle=n.y,delete n.y,n.startAngle=n.y0,delete n.y0,n.endAngle=n.y1,delete n.y1,n},ao.svg.chord=function(){function n(n,a){var l=t(this,u,n,a),c=t(this,o,n,a);return"M"+l.p0+r(l.r,l.p1,l.a1-l.a0)+(e(l,c)?i(l.r,l.p1,l.r,l.p0):i(l.r,l.p1,c.r,c.p0)+r(c.r,c.p1,c.a1-c.a0)+i(c.r,c.p1,l.r,l.p0))+"Z"}function t(n,t,e,r){var i=t.call(n,e,r),u=a.call(n,i,r),o=l.call(n,i,r)-Io,f=c.call(n,i,r)-Io;return{r:u,a0:o,a1:f,p0:[u*Math.cos(o),u*Math.sin(o)],p1:[u*Math.cos(f),u*Math.sin(f)]}}function e(n,t){return n.a0==t.a0&&n.a1==t.a1}function r(n,t,e){return"A"+n+","+n+" 0 "+ +(e>Fo)+",1 "+t}function i(n,t,e,r){return"Q 0,0 "+r}var u=Me,o=xe,a=Iu,l=gu,c=vu;return n.radius=function(t){return arguments.length?(a=En(t),n):a},n.source=function(t){return arguments.length?(u=En(t),n):u},n.target=function(t){return arguments.length?(o=En(t),n):o},n.startAngle=function(t){return arguments.length?(l=En(t),n):l},n.endAngle=function(t){return arguments.length?(c=En(t),n):c},n},ao.svg.diagonal=function(){function n(n,i){var u=t.call(this,n,i),o=e.call(this,n,i),a=(u.y+o.y)/2,l=[u,{x:u.x,y:a},{x:o.x,y:a},o];return l=l.map(r),"M"+l[0]+"C"+l[1]+" "+l[2]+" "+l[3]}var t=Me,e=xe,r=Yu;return n.source=function(e){return arguments.length?(t=En(e),n):t},n.target=function(t){return arguments.length?(e=En(t),n):e},n.projection=function(t){return arguments.length?(r=t,n):r},n},ao.svg.diagonal.radial=function(){var n=ao.svg.diagonal(),t=Yu,e=n.projection;return n.projection=function(n){return arguments.length?e(Zu(t=n)):t},n},ao.svg.symbol=function(){function n(n,r){return(Ul.get(t.call(this,n,r))||$u)(e.call(this,n,r))}var t=Xu,e=Vu;return n.type=function(e){return arguments.length?(t=En(e),n):t},n.size=function(t){return arguments.length?(e=En(t),n):e},n};var Ul=ao.map({circle:$u,cross:function(n){var t=Math.sqrt(n/5)/2;return"M"+-3*t+","+-t+"H"+-t+"V"+-3*t+"H"+t+"V"+-t+"H"+3*t+"V"+t+"H"+t+"V"+3*t+"H"+-t+"V"+t+"H"+-3*t+"Z"},diamond:function(n){var t=Math.sqrt(n/(2*Fl)),e=t*Fl;return"M0,"+-t+"L"+e+",0 0,"+t+" "+-e+",0Z"},square:function(n){var t=Math.sqrt(n)/2;return"M"+-t+","+-t+"L"+t+","+-t+" "+t+","+t+" "+-t+","+t+"Z"},"triangle-down":function(n){var t=Math.sqrt(n/jl),e=t*jl/2;return"M0,"+e+"L"+t+","+-e+" "+-t+","+-e+"Z"},"triangle-up":function(n){var t=Math.sqrt(n/jl),e=t*jl/2;return"M0,"+-e+"L"+t+","+e+" "+-t+","+e+"Z"}});ao.svg.symbolTypes=Ul.keys();var jl=Math.sqrt(3),Fl=Math.tan(30*Yo);Co.transition=function(n){for(var t,e,r=Hl||++Zl,i=Ku(n),u=[],o=Ol||{time:Date.now(),ease:Nr,delay:0,duration:250},a=-1,l=this.length;++a<l;){u.push(t=[]);for(var c=this[a],f=-1,s=c.length;++f<s;)(e=c[f])&&Qu(e,f,i,r,o),t.push(e)}return Wu(u,i,r)},Co.interrupt=function(n){return this.each(null==n?Il:Bu(Ku(n)))};var Hl,Ol,Il=Bu(Ku()),Yl=[],Zl=0;Yl.call=Co.call,Yl.empty=Co.empty,Yl.node=Co.node,Yl.size=Co.size,ao.transition=function(n,t){return n&&n.transition?Hl?n.transition(t):n:ao.selection().transition(n)},ao.transition.prototype=Yl,Yl.select=function(n){var t,e,r,i=this.id,u=this.namespace,o=[];n=A(n);for(var a=-1,l=this.length;++a<l;){o.push(t=[]);for(var c=this[a],f=-1,s=c.length;++f<s;)(r=c[f])&&(e=n.call(r,r.__data__,f,a))?("__data__"in r&&(e.__data__=r.__data__),Qu(e,f,u,i,r[u][i]),t.push(e)):t.push(null)}return Wu(o,u,i)},Yl.selectAll=function(n){var t,e,r,i,u,o=this.id,a=this.namespace,l=[];n=C(n);for(var c=-1,f=this.length;++c<f;)for(var s=this[c],h=-1,p=s.length;++h<p;)if(r=s[h]){u=r[a][o],e=n.call(r,r.__data__,h,c),l.push(t=[]);for(var g=-1,v=e.length;++g<v;)(i=e[g])&&Qu(i,g,a,o,u),t.push(i)}return Wu(l,a,o)},Yl.filter=function(n){var t,e,r,i=[];"function"!=typeof n&&(n=O(n));for(var u=0,o=this.length;o>u;u++){i.push(t=[]);for(var e=this[u],a=0,l=e.length;l>a;a++)(r=e[a])&&n.call(r,r.__data__,a,u)&&t.push(r)}return Wu(i,this.namespace,this.id)},Yl.tween=function(n,t){var e=this.id,r=this.namespace;return arguments.length<2?this.node()[r][e].tween.get(n):Y(this,null==t?function(t){t[r][e].tween.remove(n)}:function(i){i[r][e].tween.set(n,t)})},Yl.attr=function(n,t){function e(){this.removeAttribute(a)}function r(){this.removeAttributeNS(a.space,a.local)}function i(n){return null==n?e:(n+="",function(){var t,e=this.getAttribute(a);return e!==n&&(t=o(e,n),function(n){this.setAttribute(a,t(n))})})}function u(n){return null==n?r:(n+="",function(){var t,e=this.getAttributeNS(a.space,a.local);return e!==n&&(t=o(e,n),function(n){this.setAttributeNS(a.space,a.local,t(n))})})}if(arguments.length<2){for(t in n)this.attr(t,n[t]);return this}var o="transform"==n?$r:Mr,a=ao.ns.qualify(n);return Ju(this,"attr."+n,t,a.local?u:i)},Yl.attrTween=function(n,t){function e(n,e){var r=t.call(this,n,e,this.getAttribute(i));return r&&function(n){this.setAttribute(i,r(n))}}function r(n,e){var r=t.call(this,n,e,this.getAttributeNS(i.space,i.local));return r&&function(n){this.setAttributeNS(i.space,i.local,r(n))}}var i=ao.ns.qualify(n);return this.tween("attr."+n,i.local?r:e)},Yl.style=function(n,e,r){function i(){this.style.removeProperty(n)}function u(e){return null==e?i:(e+="",function(){var i,u=t(this).getComputedStyle(this,null).getPropertyValue(n);return u!==e&&(i=Mr(u,e),function(t){this.style.setProperty(n,i(t),r)})})}var o=arguments.length;if(3>o){if("string"!=typeof n){2>o&&(e="");for(r in n)this.style(r,n[r],e);return this}r=""}return Ju(this,"style."+n,e,u)},Yl.styleTween=function(n,e,r){function i(i,u){var o=e.call(this,i,u,t(this).getComputedStyle(this,null).getPropertyValue(n));return o&&function(t){this.style.setProperty(n,o(t),r)}}return arguments.length<3&&(r=""),this.tween("style."+n,i)},Yl.text=function(n){return Ju(this,"text",n,Gu)},Yl.remove=function(){var n=this.namespace;return this.each("end.transition",function(){var t;this[n].count<2&&(t=this.parentNode)&&t.removeChild(this)})},Yl.ease=function(n){var t=this.id,e=this.namespace;return arguments.length<1?this.node()[e][t].ease:("function"!=typeof n&&(n=ao.ease.apply(ao,arguments)),Y(this,function(r){r[e][t].ease=n}))},Yl.delay=function(n){var t=this.id,e=this.namespace;return arguments.length<1?this.node()[e][t].delay:Y(this,"function"==typeof n?function(r,i,u){r[e][t].delay=+n.call(r,r.__data__,i,u)}:(n=+n,function(r){r[e][t].delay=n}))},Yl.duration=function(n){var t=this.id,e=this.namespace;return arguments.length<1?this.node()[e][t].duration:Y(this,"function"==typeof n?function(r,i,u){r[e][t].duration=Math.max(1,n.call(r,r.__data__,i,u))}:(n=Math.max(1,n),function(r){r[e][t].duration=n}))},Yl.each=function(n,t){var e=this.id,r=this.namespace;if(arguments.length<2){var i=Ol,u=Hl;try{Hl=e,Y(this,function(t,i,u){Ol=t[r][e],n.call(t,t.__data__,i,u)})}finally{Ol=i,Hl=u}}else Y(this,function(i){var u=i[r][e];(u.event||(u.event=ao.dispatch("start","end","interrupt"))).on(n,t)});return this},Yl.transition=function(){for(var n,t,e,r,i=this.id,u=++Zl,o=this.namespace,a=[],l=0,c=this.length;c>l;l++){a.push(n=[]);for(var t=this[l],f=0,s=t.length;s>f;f++)(e=t[f])&&(r=e[o][i],Qu(e,f,o,u,{time:r.time,ease:r.ease,delay:r.delay+r.duration,duration:r.duration})),n.push(e)}return Wu(a,o,u)},ao.svg.axis=function(){function n(n){n.each(function(){var n,c=ao.select(this),f=this.__chart__||e,s=this.__chart__=e.copy(),h=null==l?s.ticks?s.ticks.apply(s,a):s.domain():l,p=null==t?s.tickFormat?s.tickFormat.apply(s,a):m:t,g=c.selectAll(".tick").data(h,s),v=g.enter().insert("g",".domain").attr("class","tick").style("opacity",Uo),d=ao.transition(g.exit()).style("opacity",Uo).remove(),y=ao.transition(g.order()).style("opacity",1),M=Math.max(i,0)+o,x=Zi(s),b=c.selectAll(".domain").data([0]),_=(b.enter().append("path").attr("class","domain"),ao.transition(b));v.append("line"),v.append("text");var w,S,k,N,E=v.select("line"),A=y.select("line"),C=g.select("text").text(p),z=v.select("text"),L=y.select("text"),q="top"===r||"left"===r?-1:1;if("bottom"===r||"top"===r?(n=no,w="x",k="y",S="x2",N="y2",C.attr("dy",0>q?"0em":".71em").style("text-anchor","middle"),_.attr("d","M"+x[0]+","+q*u+"V0H"+x[1]+"V"+q*u)):(n=to,w="y",k="x",S="y2",N="x2",C.attr("dy",".32em").style("text-anchor",0>q?"end":"start"),_.attr("d","M"+q*u+","+x[0]+"H0V"+x[1]+"H"+q*u)),E.attr(N,q*i),z.attr(k,q*M),A.attr(S,0).attr(N,q*i),L.attr(w,0).attr(k,q*M),s.rangeBand){var T=s,R=T.rangeBand()/2;f=s=function(n){return T(n)+R}}else f.rangeBand?f=s:d.call(n,s,f);v.call(n,f,s),y.call(n,s,s)})}var t,e=ao.scale.linear(),r=Vl,i=6,u=6,o=3,a=[10],l=null;return n.scale=function(t){return arguments.length?(e=t,n):e},n.orient=function(t){return arguments.length?(r=t in Xl?t+"":Vl,n):r},n.ticks=function(){return arguments.length?(a=co(arguments),n):a},n.tickValues=function(t){return arguments.length?(l=t,n):l},n.tickFormat=function(e){return arguments.length?(t=e,n):t},n.tickSize=function(t){var e=arguments.length;return e?(i=+t,u=+arguments[e-1],n):i},n.innerTickSize=function(t){return arguments.length?(i=+t,n):i},n.outerTickSize=function(t){return arguments.length?(u=+t,n):u},n.tickPadding=function(t){return arguments.length?(o=+t,n):o},n.tickSubdivide=function(){return arguments.length&&n},n};var Vl="bottom",Xl={top:1,right:1,bottom:1,left:1};ao.svg.brush=function(){function n(t){t.each(function(){var t=ao.select(this).style("pointer-events","all").style("-webkit-tap-highlight-color","rgba(0,0,0,0)").on("mousedown.brush",u).on("touchstart.brush",u),o=t.selectAll(".background").data([0]);o.enter().append("rect").attr("class","background").style("visibility","hidden").style("cursor","crosshair"),t.selectAll(".extent").data([0]).enter().append("rect").attr("class","extent").style("cursor","move");var a=t.selectAll(".resize").data(v,m);a.exit().remove(),a.enter().append("g").attr("class",function(n){return"resize "+n}).style("cursor",function(n){return $l[n]}).append("rect").attr("x",function(n){return/[ew]$/.test(n)?-3:null}).attr("y",function(n){return/^[ns]/.test(n)?-3:null}).attr("width",6).attr("height",6).style("visibility","hidden"),a.style("display",n.empty()?"none":null);var l,s=ao.transition(t),h=ao.transition(o);c&&(l=Zi(c),h.attr("x",l[0]).attr("width",l[1]-l[0]),r(s)),f&&(l=Zi(f),h.attr("y",l[0]).attr("height",l[1]-l[0]),i(s)),e(s)})}function e(n){n.selectAll(".resize").attr("transform",function(n){return"translate("+s[+/e$/.test(n)]+","+h[+/^s/.test(n)]+")"})}function r(n){n.select(".extent").attr("x",s[0]),n.selectAll(".extent,.n>rect,.s>rect").attr("width",s[1]-s[0])}function i(n){n.select(".extent").attr("y",h[0]),n.selectAll(".extent,.e>rect,.w>rect").attr("height",h[1]-h[0])}function u(){function u(){32==ao.event.keyCode&&(C||(M=null,L[0]-=s[1],L[1]-=h[1],C=2),S())}function v(){32==ao.event.keyCode&&2==C&&(L[0]+=s[1],L[1]+=h[1],C=0,S())}function d(){var n=ao.mouse(b),t=!1;x&&(n[0]+=x[0],n[1]+=x[1]),C||(ao.event.altKey?(M||(M=[(s[0]+s[1])/2,(h[0]+h[1])/2]),L[0]=s[+(n[0]<M[0])],L[1]=h[+(n[1]<M[1])]):M=null),E&&y(n,c,0)&&(r(k),t=!0),A&&y(n,f,1)&&(i(k),t=!0),t&&(e(k),w({type:"brush",mode:C?"move":"resize"}))}function y(n,t,e){var r,i,u=Zi(t),l=u[0],c=u[1],f=L[e],v=e?h:s,d=v[1]-v[0];return C&&(l-=f,c-=d+f),r=(e?g:p)?Math.max(l,Math.min(c,n[e])):n[e],C?i=(r+=f)+d:(M&&(f=Math.max(l,Math.min(c,2*M[e]-r))),r>f?(i=r,r=f):i=f),v[0]!=r||v[1]!=i?(e?a=null:o=null,v[0]=r,v[1]=i,!0):void 0}function m(){d(),k.style("pointer-events","all").selectAll(".resize").style("display",n.empty()?"none":null),ao.select("body").style("cursor",null),q.on("mousemove.brush",null).on("mouseup.brush",null).on("touchmove.brush",null).on("touchend.brush",null).on("keydown.brush",null).on("keyup.brush",null),z(),w({type:"brushend"})}var M,x,b=this,_=ao.select(ao.event.target),w=l.of(b,arguments),k=ao.select(b),N=_.datum(),E=!/^(n|s)$/.test(N)&&c,A=!/^(e|w)$/.test(N)&&f,C=_.classed("extent"),z=W(b),L=ao.mouse(b),q=ao.select(t(b)).on("keydown.brush",u).on("keyup.brush",v);if(ao.event.changedTouches?q.on("touchmove.brush",d).on("touchend.brush",m):q.on("mousemove.brush",d).on("mouseup.brush",m),k.interrupt().selectAll("*").interrupt(),C)L[0]=s[0]-L[0],L[1]=h[0]-L[1];else if(N){var T=+/w$/.test(N),R=+/^n/.test(N);x=[s[1-T]-L[0],h[1-R]-L[1]],L[0]=s[T],L[1]=h[R]}else ao.event.altKey&&(M=L.slice());k.style("pointer-events","none").selectAll(".resize").style("display",null),ao.select("body").style("cursor",_.style("cursor")),w({type:"brushstart"}),d()}var o,a,l=N(n,"brushstart","brush","brushend"),c=null,f=null,s=[0,0],h=[0,0],p=!0,g=!0,v=Bl[0];return n.event=function(n){n.each(function(){var n=l.of(this,arguments),t={x:s,y:h,i:o,j:a},e=this.__chart__||t;this.__chart__=t,Hl?ao.select(this).transition().each("start.brush",function(){o=e.i,a=e.j,s=e.x,h=e.y,n({type:"brushstart"})}).tween("brush:brush",function(){var e=xr(s,t.x),r=xr(h,t.y);return o=a=null,function(i){s=t.x=e(i),h=t.y=r(i),n({type:"brush",mode:"resize"})}}).each("end.brush",function(){o=t.i,a=t.j,n({type:"brush",mode:"resize"}),n({type:"brushend"})}):(n({type:"brushstart"}),n({type:"brush",mode:"resize"}),n({type:"brushend"}))})},n.x=function(t){return arguments.length?(c=t,v=Bl[!c<<1|!f],n):c},n.y=function(t){return arguments.length?(f=t,v=Bl[!c<<1|!f],n):f},n.clamp=function(t){return arguments.length?(c&&f?(p=!!t[0],g=!!t[1]):c?p=!!t:f&&(g=!!t),n):c&&f?[p,g]:c?p:f?g:null},n.extent=function(t){var e,r,i,u,l;return arguments.length?(c&&(e=t[0],r=t[1],f&&(e=e[0],r=r[0]),o=[e,r],c.invert&&(e=c(e),r=c(r)),e>r&&(l=e,e=r,r=l),e==s[0]&&r==s[1]||(s=[e,r])),f&&(i=t[0],u=t[1],c&&(i=i[1],u=u[1]),a=[i,u],f.invert&&(i=f(i),u=f(u)),i>u&&(l=i,i=u,u=l),i==h[0]&&u==h[1]||(h=[i,u])),n):(c&&(o?(e=o[0],r=o[1]):(e=s[0],r=s[1],c.invert&&(e=c.invert(e),r=c.invert(r)),e>r&&(l=e,e=r,r=l))),f&&(a?(i=a[0],u=a[1]):(i=h[0],u=h[1],f.invert&&(i=f.invert(i),u=f.invert(u)),i>u&&(l=i,i=u,u=l))),c&&f?[[e,i],[r,u]]:c?[e,r]:f&&[i,u])},n.clear=function(){return n.empty()||(s=[0,0],h=[0,0],o=a=null),n},n.empty=function(){return!!c&&s[0]==s[1]||!!f&&h[0]==h[1]},ao.rebind(n,l,"on")};var $l={n:"ns-resize",e:"ew-resize",s:"ns-resize",w:"ew-resize",nw:"nwse-resize",ne:"nesw-resize",se:"nwse-resize",sw:"nesw-resize"},Bl=[["n","e","s","w","nw","ne","se","sw"],["e","w"],["n","s"],[]],Wl=ga.format=xa.timeFormat,Jl=Wl.utc,Gl=Jl("%Y-%m-%dT%H:%M:%S.%LZ");Wl.iso=Date.prototype.toISOString&&+new Date("2000-01-01T00:00:00.000Z")?eo:Gl,eo.parse=function(n){var t=new Date(n);return isNaN(t)?null:t},eo.toString=Gl.toString,ga.second=On(function(n){return new va(1e3*Math.floor(n/1e3))},function(n,t){n.setTime(n.getTime()+1e3*Math.floor(t))},function(n){return n.getSeconds()}),ga.seconds=ga.second.range,ga.seconds.utc=ga.second.utc.range,ga.minute=On(function(n){return new va(6e4*Math.floor(n/6e4))},function(n,t){n.setTime(n.getTime()+6e4*Math.floor(t))},function(n){return n.getMinutes()}),ga.minutes=ga.minute.range,ga.minutes.utc=ga.minute.utc.range,ga.hour=On(function(n){var t=n.getTimezoneOffset()/60;return new va(36e5*(Math.floor(n/36e5-t)+t))},function(n,t){n.setTime(n.getTime()+36e5*Math.floor(t))},function(n){return n.getHours()}),ga.hours=ga.hour.range,ga.hours.utc=ga.hour.utc.range,ga.month=On(function(n){return n=ga.day(n),n.setDate(1),n},function(n,t){n.setMonth(n.getMonth()+t)},function(n){return n.getMonth()}),ga.months=ga.month.range,ga.months.utc=ga.month.utc.range;var Kl=[1e3,5e3,15e3,3e4,6e4,3e5,9e5,18e5,36e5,108e5,216e5,432e5,864e5,1728e5,6048e5,2592e6,7776e6,31536e6],Ql=[[ga.second,1],[ga.second,5],[ga.second,15],[ga.second,30],[ga.minute,1],[ga.minute,5],[ga.minute,15],[ga.minute,30],[ga.hour,1],[ga.hour,3],[ga.hour,6],[ga.hour,12],[ga.day,1],[ga.day,2],[ga.week,1],[ga.month,1],[ga.month,3],[ga.year,1]],nc=Wl.multi([[".%L",function(n){return n.getMilliseconds()}],[":%S",function(n){return n.getSeconds()}],["%I:%M",function(n){return n.getMinutes()}],["%I %p",function(n){return n.getHours()}],["%a %d",function(n){return n.getDay()&&1!=n.getDate()}],["%b %d",function(n){return 1!=n.getDate()}],["%B",function(n){return n.getMonth()}],["%Y",zt]]),tc={range:function(n,t,e){return ao.range(Math.ceil(n/e)*e,+t,e).map(io)},floor:m,ceil:m};Ql.year=ga.year,ga.scale=function(){return ro(ao.scale.linear(),Ql,nc)};var ec=Ql.map(function(n){return[n[0].utc,n[1]]}),rc=Jl.multi([[".%L",function(n){return n.getUTCMilliseconds()}],[":%S",function(n){return n.getUTCSeconds()}],["%I:%M",function(n){return n.getUTCMinutes()}],["%I %p",function(n){return n.getUTCHours()}],["%a %d",function(n){return n.getUTCDay()&&1!=n.getUTCDate()}],["%b %d",function(n){return 1!=n.getUTCDate()}],["%B",function(n){return n.getUTCMonth()}],["%Y",zt]]);ec.year=ga.year.utc,ga.scale.utc=function(){return ro(ao.scale.linear(),ec,rc)},ao.text=An(function(n){return n.responseText}),ao.json=function(n,t){return Cn(n,"application/json",uo,t)},ao.html=function(n,t){return Cn(n,"text/html",oo,t)},ao.xml=An(function(n){return n.responseXML}),"function"==typeof define&&define.amd?(this.d3=ao,define(ao)):"object"==typeof module&&module.exports?module.exports=ao:this.d3=ao}();
diff --git a/contrib/eigen/bench/perf_monitoring/resources/s2.js b/contrib/eigen/bench/perf_monitoring/resources/s2.js
new file mode 100644
index 0000000000000000000000000000000000000000..54eda2cd59f08a129f82b4684686ed544f599ce8
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/resources/s2.js
@@ -0,0 +1 @@
+!function(){var a={};a.dev=!1,a.tooltip=a.tooltip||{},a.utils=a.utils||{},a.models=a.models||{},a.charts={},a.logs={},a.dom={},"undefined"!=typeof module&&"undefined"!=typeof exports&&"undefined"==typeof d3&&(d3=require("d3")),a.dispatch=d3.dispatch("render_start","render_end"),Function.prototype.bind||(Function.prototype.bind=function(a){if("function"!=typeof this)throw new TypeError("Function.prototype.bind - what is trying to be bound is not callable");var b=Array.prototype.slice.call(arguments,1),c=this,d=function(){},e=function(){return c.apply(this instanceof d&&a?this:a,b.concat(Array.prototype.slice.call(arguments)))};return d.prototype=this.prototype,e.prototype=new d,e}),a.dev&&(a.dispatch.on("render_start",function(b){a.logs.startTime=+new Date}),a.dispatch.on("render_end",function(b){a.logs.endTime=+new Date,a.logs.totalTime=a.logs.endTime-a.logs.startTime,a.log("total",a.logs.totalTime)})),a.log=function(){if(a.dev&&window.console&&console.log&&console.log.apply)console.log.apply(console,arguments);else if(a.dev&&window.console&&"function"==typeof console.log&&Function.prototype.bind){var b=Function.prototype.bind.call(console.log,console);b.apply(console,arguments)}return arguments[arguments.length-1]},a.deprecated=function(a,b){console&&console.warn&&console.warn("nvd3 warning: `"+a+"` has been deprecated. ",b||"")},a.render=function(b){b=b||1,a.render.active=!0,a.dispatch.render_start();var c=function(){for(var d,e,f=0;b>f&&(e=a.render.queue[f]);f++)d=e.generate(),typeof e.callback==typeof Function&&e.callback(d);a.render.queue.splice(0,f),a.render.queue.length?setTimeout(c):(a.dispatch.render_end(),a.render.active=!1)};setTimeout(c)},a.render.active=!1,a.render.queue=[],a.addGraph=function(b){typeof arguments[0]==typeof Function&&(b={generate:arguments[0],callback:arguments[1]}),a.render.queue.push(b),a.render.active||a.render()},"undefined"!=typeof module&&"undefined"!=typeof exports&&(module.exports=a),"undefined"!=typeof window&&(window.nv=a),a.dom.write=function(a){return void 0!==window.fastdom?fastdom.mutate(a):a()},a.dom.read=function(a){return void 0!==window.fastdom?fastdom.measure(a):a()},a.interactiveGuideline=function(){"use strict";function b(l){l.each(function(l){function m(){var a=d3.mouse(this),d=a[0],e=a[1],h=!0,i=!1;if(k&&(d=d3.event.offsetX,e=d3.event.offsetY,"svg"!==d3.event.target.tagName&&(h=!1),d3.event.target.className.baseVal.match("nv-legend")&&(i=!0)),h&&(d-=c.left,e-=c.top),"mouseout"===d3.event.type||0>d||0>e||d>o||e>p||d3.event.relatedTarget&&void 0===d3.event.relatedTarget.ownerSVGElement||i){if(k&&d3.event.relatedTarget&&void 0===d3.event.relatedTarget.ownerSVGElement&&(void 0===d3.event.relatedTarget.className||d3.event.relatedTarget.className.match(j.nvPointerEventsClass)))return;return g.elementMouseout({mouseX:d,mouseY:e}),b.renderGuideLine(null),void j.hidden(!0)}j.hidden(!1);var l="function"==typeof f.rangeBands,m=void 0;if(l){var n=d3.bisect(f.range(),d)-1;if(!(f.range()[n]+f.rangeBand()>=d))return g.elementMouseout({mouseX:d,mouseY:e}),b.renderGuideLine(null),void j.hidden(!0);m=f.domain()[d3.bisect(f.range(),d)-1]}else m=f.invert(d);g.elementMousemove({mouseX:d,mouseY:e,pointXValue:m}),"dblclick"===d3.event.type&&g.elementDblclick({mouseX:d,mouseY:e,pointXValue:m}),"click"===d3.event.type&&g.elementClick({mouseX:d,mouseY:e,pointXValue:m}),"mousedown"===d3.event.type&&g.elementMouseDown({mouseX:d,mouseY:e,pointXValue:m}),"mouseup"===d3.event.type&&g.elementMouseUp({mouseX:d,mouseY:e,pointXValue:m})}var n=d3.select(this),o=d||960,p=e||400,q=n.selectAll("g.nv-wrap.nv-interactiveLineLayer").data([l]),r=q.enter().append("g").attr("class"," nv-wrap nv-interactiveLineLayer");r.append("g").attr("class","nv-interactiveGuideLine"),i&&(i.on("touchmove",m).on("mousemove",m,!0).on("mouseout",m,!0).on("mousedown",m,!0).on("mouseup",m,!0).on("dblclick",m).on("click",m),b.guideLine=null,b.renderGuideLine=function(c){h&&(b.guideLine&&b.guideLine.attr("x1")===c||a.dom.write(function(){var b=q.select(".nv-interactiveGuideLine").selectAll("line").data(null!=c?[a.utils.NaNtoZero(c)]:[],String);b.enter().append("line").attr("class","nv-guideline").attr("x1",function(a){return a}).attr("x2",function(a){return a}).attr("y1",p).attr("y2",0),b.exit().remove()}))})})}var c={left:0,top:0},d=null,e=null,f=d3.scale.linear(),g=d3.dispatch("elementMousemove","elementMouseout","elementClick","elementDblclick","elementMouseDown","elementMouseUp"),h=!0,i=null,j=a.models.tooltip(),k=window.ActiveXObject;return j.duration(0).hideDelay(0).hidden(!1),b.dispatch=g,b.tooltip=j,b.margin=function(a){return arguments.length?(c.top="undefined"!=typeof a.top?a.top:c.top,c.left="undefined"!=typeof a.left?a.left:c.left,b):c},b.width=function(a){return arguments.length?(d=a,b):d},b.height=function(a){return arguments.length?(e=a,b):e},b.xScale=function(a){return arguments.length?(f=a,b):f},b.showGuideLine=function(a){return arguments.length?(h=a,b):h},b.svgContainer=function(a){return arguments.length?(i=a,b):i},b},a.interactiveBisect=function(a,b,c){"use strict";if(!(a instanceof Array))return null;var d;d="function"!=typeof c?function(a){return a.x}:c;var e=function(a,b){return d(a)-b},f=d3.bisector(e).left,g=d3.max([0,f(a,b)-1]),h=d(a[g]);if("undefined"==typeof h&&(h=g),h===b)return g;var i=d3.min([g+1,a.length-1]),j=d(a[i]);return"undefined"==typeof j&&(j=i),Math.abs(j-b)>=Math.abs(h-b)?g:i},a.nearestValueIndex=function(a,b,c){"use strict";var d=1/0,e=null;return a.forEach(function(a,f){var g=Math.abs(b-a);null!=a&&d>=g&&c>g&&(d=g,e=f)}),e},a.models.tooltip=function(){"use strict";function b(){if(!l||!l.node()){var a=[1];l=d3.select(document.body).selectAll(".nvtooltip").data(a),l.enter().append("div").attr("class","nvtooltip "+(i?i:"xy-tooltip")).attr("id",d).style("top",0).style("left",0).style("opacity",0).style("position","fixed").selectAll("div, table, td, tr").classed(q,!0).classed(q,!0),l.exit().remove()}}function c(){return n&&w(e)?(a.dom.write(function(){b();var a=u(e);a&&(l.node().innerHTML=a),y()}),c):void 0}var d="nvtooltip-"+Math.floor(1e5*Math.random()),e=null,f="w",g=25,h=0,i=null,j=!0,k=200,l=null,m={left:null,top:null},n=!0,o=100,p=!0,q="nv-pointer-events-none",r=function(a,b){return a},s=function(a){return a},t=function(a,b){return a},u=function(a){if(null===a)return"";var b=d3.select(document.createElement("table"));if(p){var c=b.selectAll("thead").data([a]).enter().append("thead");c.append("tr").append("td").attr("colspan",3).append("strong").classed("x-value",!0).html(s(a.value))}var d=b.selectAll("tbody").data([a]).enter().append("tbody"),e=d.selectAll("tr").data(function(a){return a.series}).enter().append("tr").classed("highlight",function(a){return a.highlight});e.append("td").classed("legend-color-guide",!0).append("div").style("background-color",function(a){return a.color}),e.append("td").classed("key",!0).classed("total",function(a){return!!a.total}).html(function(a,b){return t(a.key,b)}),e.append("td").classed("value",!0).html(function(a,b){return r(a.value,b)}),e.filter(function(a,b){return void 0!==a.percent}).append("td").classed("percent",!0).html(function(a,b){return"("+d3.format("%")(a.percent)+")"}),e.selectAll("td").each(function(a){if(a.highlight){var b=d3.scale.linear().domain([0,1]).range(["#fff",a.color]),c=.6;d3.select(this).style("border-bottom-color",b(c)).style("border-top-color",b(c))}});var f=b.node().outerHTML;return void 0!==a.footer&&(f+="<div class='footer'>"+a.footer+"</div>"),f},v=function(){var a={left:null!==d3.event?d3.event.clientX:0,top:null!==d3.event?d3.event.clientY:0};if("none"!=getComputedStyle(document.body).transform){var b=document.body.getBoundingClientRect();a.left-=b.left,a.top-=b.top}return a},w=function(b){if(b&&b.series){if(a.utils.isArray(b.series))return!0;if(a.utils.isObject(b.series))return b.series=[b.series],!0}return!1},x=function(a){var b,c,d,e=l.node().offsetHeight,h=l.node().offsetWidth,i=document.documentElement.clientWidth,j=document.documentElement.clientHeight;switch(f){case"e":b=-h-g,c=-(e/2),a.left+b<0&&(b=g),(d=a.top+c)<0&&(c-=d),(d=a.top+c+e)>j&&(c-=d-j);break;case"w":b=g,c=-(e/2),a.left+b+h>i&&(b=-h-g),(d=a.top+c)<0&&(c-=d),(d=a.top+c+e)>j&&(c-=d-j);break;case"n":b=-(h/2)-5,c=g,a.top+c+e>j&&(c=-e-g),(d=a.left+b)<0&&(b-=d),(d=a.left+b+h)>i&&(b-=d-i);break;case"s":b=-(h/2),c=-e-g,a.top+c<0&&(c=g),(d=a.left+b)<0&&(b-=d),(d=a.left+b+h)>i&&(b-=d-i);break;case"center":b=-(h/2),c=-(e/2);break;default:b=0,c=0}return{left:b,top:c}},y=function(){a.dom.read(function(){var a=v(),b=x(a),c=a.left+b.left,d=a.top+b.top;if(j)l.interrupt().transition().delay(k).duration(0).style("opacity",0);else{var e="translate("+m.left+"px, "+m.top+"px)",f="translate("+Math.round(c)+"px, "+Math.round(d)+"px)",g=d3.interpolateString(e,f),h=l.style("opacity")<.1;l.interrupt().transition().duration(h?0:o).styleTween("transform",function(a){return g},"important").styleTween("-webkit-transform",function(a){return g}).style("-ms-transform",f).style("opacity",1)}m.left=c,m.top=d})};return c.nvPointerEventsClass=q,c.options=a.utils.optionsFunc.bind(c),c._options=Object.create({},{duration:{get:function(){return o},set:function(a){o=a}},gravity:{get:function(){return f},set:function(a){f=a}},distance:{get:function(){return g},set:function(a){g=a}},snapDistance:{get:function(){return h},set:function(a){h=a}},classes:{get:function(){return i},set:function(a){i=a}},enabled:{get:function(){return n},set:function(a){n=a}},hideDelay:{get:function(){return k},set:function(a){k=a}},contentGenerator:{get:function(){return u},set:function(a){u=a}},valueFormatter:{get:function(){return r},set:function(a){r=a}},headerFormatter:{get:function(){return s},set:function(a){s=a}},keyFormatter:{get:function(){return t},set:function(a){t=a}},headerEnabled:{get:function(){return p},set:function(a){p=a}},position:{get:function(){return v},set:function(a){v=a}},chartContainer:{get:function(){return document.body},set:function(b){a.deprecated("chartContainer","feature removed after 1.8.3")}},fixedTop:{get:function(){return null},set:function(b){a.deprecated("fixedTop","feature removed after 1.8.1")}},offset:{get:function(){return{left:0,top:0}},set:function(b){a.deprecated("offset","use chart.tooltip.distance() instead")}},hidden:{get:function(){return j},set:function(a){j!=a&&(j=!!a,c())}},data:{get:function(){return e},set:function(a){a.point&&(a.value=a.point.x,a.series=a.series||{},a.series.value=a.point.y,a.series.color=a.point.color||a.series.color),e=a}},node:{get:function(){return l.node()},set:function(a){}},id:{get:function(){return d},set:function(a){}}}),a.utils.initOptions(c),c},a.utils.windowSize=function(){var a={width:640,height:480};return window.innerWidth&&window.innerHeight?(a.width=window.innerWidth,a.height=window.innerHeight,a):"CSS1Compat"==document.compatMode&&document.documentElement&&document.documentElement.offsetWidth?(a.width=document.documentElement.offsetWidth,a.height=document.documentElement.offsetHeight,a):document.body&&document.body.offsetWidth?(a.width=document.body.offsetWidth,a.height=document.body.offsetHeight,a):a},a.utils.isArray=Array.isArray,a.utils.isObject=function(a){return null!==a&&"object"==typeof a},a.utils.isFunction=function(a){return"function"==typeof a},a.utils.isDate=function(a){return"[object Date]"===toString.call(a)},a.utils.isNumber=function(a){return!isNaN(a)&&"number"==typeof a},a.utils.windowResize=function(b){return window.addEventListener?window.addEventListener("resize",b):a.log("ERROR: Failed to bind to window.resize with: ",b),{callback:b,clear:function(){window.removeEventListener("resize",b)}}},a.utils.getColor=function(b){if(void 0===b)return a.utils.defaultColor();if(a.utils.isArray(b)){var c=d3.scale.ordinal().range(b);return function(a,b){var d=void 0===b?a:b;return a.color||c(d)}}return b},a.utils.defaultColor=function(){return a.utils.getColor(d3.scale.category20().range())},a.utils.customTheme=function(b,c,d){c=c||function(a){return a.key},d=d||d3.scale.category20().range();var e=d.length;return function(f,g){var h=c(f);return a.utils.isFunction(b[h])?b[h]():void 0!==b[h]?b[h]:(e||(e=d.length),e-=1,d[e])}},a.utils.pjax=function(b,c){var d=function(d){d3.html(d,function(d){var e=d3.select(c).node();e.parentNode.replaceChild(d3.select(d).select(c).node(),e),a.utils.pjax(b,c)})};d3.selectAll(b).on("click",function(){history.pushState(this.href,this.textContent,this.href),d(this.href),d3.event.preventDefault()}),d3.select(window).on("popstate",function(){d3.event.state&&d(d3.event.state)})},a.utils.calcApproxTextWidth=function(b){if(a.utils.isFunction(b.style)&&a.utils.isFunction(b.text)){var c=parseInt(b.style("font-size").replace("px",""),10),d=b.text().length;return a.utils.NaNtoZero(d*c*.5)}return 0},a.utils.NaNtoZero=function(b){return!a.utils.isNumber(b)||isNaN(b)||null===b||b===1/0||b===-(1/0)?0:b},d3.selection.prototype.watchTransition=function(a){var b=[this].concat([].slice.call(arguments,1));return a.transition.apply(a,b)},a.utils.renderWatch=function(b,c){if(!(this instanceof a.utils.renderWatch))return new a.utils.renderWatch(b,c);var d=void 0!==c?c:250,e=[],f=this;this.models=function(a){return a=[].slice.call(arguments,0),a.forEach(function(a){a.__rendered=!1,function(a){a.dispatch.on("renderEnd",function(b){a.__rendered=!0,f.renderEnd("model")})}(a),e.indexOf(a)<0&&e.push(a)}),this},this.reset=function(a){void 0!==a&&(d=a),e=[]},this.transition=function(a,b,c){if(b=arguments.length>1?[].slice.call(arguments,1):[],c=b.length>1?b.pop():void 0!==d?d:250,a.__rendered=!1,e.indexOf(a)<0&&e.push(a),0===c)return a.__rendered=!0,a.delay=function(){return this},a.duration=function(){return this},a;0===a.length?a.__rendered=!0:a.every(function(a){return!a.length})?a.__rendered=!0:a.__rendered=!1;var g=0;return a.transition().duration(c).each(function(){++g}).each("end",function(c,d){0===--g&&(a.__rendered=!0,f.renderEnd.apply(this,b))})},this.renderEnd=function(){e.every(function(a){return a.__rendered})&&(e.forEach(function(a){a.__rendered=!1}),b.renderEnd.apply(this,arguments))}},a.utils.deepExtend=function(b){var c=arguments.length>1?[].slice.call(arguments,1):[];c.forEach(function(c){for(var d in c){var e=a.utils.isArray(b[d]),f=a.utils.isObject(b[d]),g=a.utils.isObject(c[d]);f&&!e&&g?a.utils.deepExtend(b[d],c[d]):b[d]=c[d]}})},a.utils.state=function(){if(!(this instanceof a.utils.state))return new a.utils.state;var b={},c=function(){},d=function(){return{}},e=null,f=null;this.dispatch=d3.dispatch("change","set"),this.dispatch.on("set",function(a){c(a,!0)}),this.getter=function(a){return d=a,this},this.setter=function(a,b){return b||(b=function(){}),c=function(c,d){a(c),d&&b()},this},this.init=function(b){e=e||{},a.utils.deepExtend(e,b)};var g=function(){var a=d();if(JSON.stringify(a)===JSON.stringify(b))return!1;for(var c in a)void 0===b[c]&&(b[c]={}),b[c]=a[c],f=!0;return!0};this.update=function(){e&&(c(e,!1),e=null),g.call(this)&&this.dispatch.change(b)}},a.utils.optionsFunc=function(b){return b&&d3.map(b).forEach(function(b,c){a.utils.isFunction(this[b])&&this[b](c)}.bind(this)),this},a.utils.calcTicksX=function(b,c){var d=1,e=0;for(e;e<c.length;e+=1){var f=c[e]&&c[e].values?c[e].values.length:0;d=f>d?f:d}return a.log("Requested number of ticks: ",b),a.log("Calculated max values to be: ",d),b=b>d?b=d-1:b,b=1>b?1:b,b=Math.floor(b),a.log("Calculating tick count as: ",b),b},a.utils.calcTicksY=function(b,c){return a.utils.calcTicksX(b,c)},a.utils.initOption=function(a,b){a._calls&&a._calls[b]?a[b]=a._calls[b]:(a[b]=function(c){return arguments.length?(a._overrides[b]=!0,a._options[b]=c,a):a._options[b]},a["_"+b]=function(c){return arguments.length?(a._overrides[b]||(a._options[b]=c),a):a._options[b]})},a.utils.initOptions=function(b){b._overrides=b._overrides||{};var c=Object.getOwnPropertyNames(b._options||{}),d=Object.getOwnPropertyNames(b._calls||{});c=c.concat(d);for(var e in c)a.utils.initOption(b,c[e])},a.utils.inheritOptionsD3=function(a,b,c){a._d3options=c.concat(a._d3options||[]),c.unshift(b),c.unshift(a),d3.rebind.apply(this,c)},a.utils.arrayUnique=function(a){return a.sort().filter(function(b,c){return!c||b!=a[c-1]})},a.utils.symbolMap=d3.map(),a.utils.symbol=function(){function b(b,e){var f=c.call(this,b,e),g=d.call(this,b,e);return-1!==d3.svg.symbolTypes.indexOf(f)?d3.svg.symbol().type(f).size(g)():a.utils.symbolMap.get(f)(g)}var c,d=64;return b.type=function(a){return arguments.length?(c=d3.functor(a),b):c},b.size=function(a){return arguments.length?(d=d3.functor(a),b):d},b},a.utils.inheritOptions=function(b,c){var d=Object.getOwnPropertyNames(c._options||{}),e=Object.getOwnPropertyNames(c._calls||{}),f=c._inherited||[],g=c._d3options||[],h=d.concat(e).concat(f).concat(g);h.unshift(c),h.unshift(b),d3.rebind.apply(this,h),b._inherited=a.utils.arrayUnique(d.concat(e).concat(f).concat(d).concat(b._inherited||[])),b._d3options=a.utils.arrayUnique(g.concat(b._d3options||[]))},a.utils.initSVG=function(a){a.classed({"nvd3-svg":!0})},a.utils.sanitizeHeight=function(a,b){return a||parseInt(b.style("height"),10)||400},a.utils.sanitizeWidth=function(a,b){return a||parseInt(b.style("width"),10)||960},a.utils.availableHeight=function(b,c,d){return Math.max(0,a.utils.sanitizeHeight(b,c)-d.top-d.bottom)},a.utils.availableWidth=function(b,c,d){return Math.max(0,a.utils.sanitizeWidth(b,c)-d.left-d.right)},a.utils.noData=function(b,c){var d=b.options(),e=d.margin(),f=d.noData(),g=null==f?["No Data Available."]:[f],h=a.utils.availableHeight(null,c,e),i=a.utils.availableWidth(null,c,e),j=e.left+i/2,k=e.top+h/2;c.selectAll("g").remove();var l=c.selectAll(".nv-noData").data(g);l.enter().append("text").attr("class","nvd3 nv-noData").attr("dy","-.7em").style("text-anchor","middle"),l.attr("x",j).attr("y",k).text(function(a){return a})},a.utils.wrapTicks=function(a,b){a.each(function(){for(var a,c=d3.select(this),d=c.text().split(/\s+/).reverse(),e=[],f=0,g=1.1,h=c.attr("y"),i=parseFloat(c.attr("dy")),j=c.text(null).append("tspan").attr("x",0).attr("y",h).attr("dy",i+"em");a=d.pop();)e.push(a),j.text(e.join(" ")),j.node().getComputedTextLength()>b&&(e.pop(),j.text(e.join(" ")),e=[a],j=c.append("tspan").attr("x",0).attr("y",h).attr("dy",++f*g+i+"em").text(a))})},a.utils.arrayEquals=function(b,c){if(b===c)return!0;if(!b||!c)return!1;if(b.length!=c.length)return!1;for(var d=0,e=b.length;e>d;d++)if(b[d]instanceof Array&&c[d]instanceof Array){if(!a.arrayEquals(b[d],c[d]))return!1}else if(b[d]!=c[d])return!1;return!0},a.models.axis=function(){"use strict";function b(g){return t.reset(),g.each(function(b){var g=d3.select(this);a.utils.initSVG(g);var q=g.selectAll("g.nv-wrap.nv-axis").data([b]),r=q.enter().append("g").attr("class","nvd3 nv-wrap nv-axis"),u=(r.append("g"),q.select("g"));null!==n?c.ticks(n):("top"==c.orient()||"bottom"==c.orient())&&c.ticks(Math.abs(d.range()[1]-d.range()[0])/100),u.watchTransition(t,"axis").call(c),s=s||c.scale();var v=c.tickFormat();null==v&&(v=s.tickFormat());var w=u.selectAll("text.nv-axislabel").data([h||null]);w.exit().remove(),void 0!==p&&u.selectAll("g").select("text").style("font-size",p);var x,y,z;switch(c.orient()){case"top":w.enter().append("text").attr("class","nv-axislabel"),z=0,1===d.range().length?z=m?2*d.range()[0]+d.rangeBand():0:2===d.range().length?z=m?d.range()[0]+d.range()[1]+d.rangeBand():d.range()[1]:d.range().length>2&&(z=d.range()[d.range().length-1]+(d.range()[1]-d.range()[0])),w.attr("text-anchor","middle").attr("y",0).attr("x",z/2),i&&(y=q.selectAll("g.nv-axisMaxMin").data(d.domain()),y.enter().append("g").attr("class",function(a,b){return["nv-axisMaxMin","nv-axisMaxMin-x",0==b?"nv-axisMin-x":"nv-axisMax-x"].join(" ")}).append("text"),y.exit().remove(),y.attr("transform",function(b,c){return"translate("+a.utils.NaNtoZero(d(b))+",0)"}).select("text").attr("dy","-0.5em").attr("y",-c.tickPadding()).attr("text-anchor","middle").text(function(a,b){var c=v(a);return(""+c).match("NaN")?"":c}),y.watchTransition(t,"min-max top").attr("transform",function(b,c){return"translate("+a.utils.NaNtoZero(d.range()[c])+",0)"}));break;case"bottom":x=o+36;var A=30,B=0,C=u.selectAll("g").select("text"),D="";if(j%360){C.attr("transform",""),C.each(function(a,b){var c=this.getBoundingClientRect(),d=c.width;B=c.height,d>A&&(A=d)}),D="rotate("+j+" 0,"+(B/2+c.tickPadding())+")";var E=Math.abs(Math.sin(j*Math.PI/180));x=(E?E*A:A)+30,C.attr("transform",D).style("text-anchor",j%360>0?"start":"end")}else l?C.attr("transform",function(a,b){return"translate(0,"+(b%2==0?"0":"12")+")"}):C.attr("transform","translate(0,0)");w.enter().append("text").attr("class","nv-axislabel"),z=0,1===d.range().length?z=m?2*d.range()[0]+d.rangeBand():0:2===d.range().length?z=m?d.range()[0]+d.range()[1]+d.rangeBand():d.range()[1]:d.range().length>2&&(z=d.range()[d.range().length-1]+(d.range()[1]-d.range()[0])),w.attr("text-anchor","middle").attr("y",x).attr("x",z/2),i&&(y=q.selectAll("g.nv-axisMaxMin").data([d.domain()[0],d.domain()[d.domain().length-1]]),y.enter().append("g").attr("class",function(a,b){return["nv-axisMaxMin","nv-axisMaxMin-x",0==b?"nv-axisMin-x":"nv-axisMax-x"].join(" ")}).append("text"),y.exit().remove(),y.attr("transform",function(b,c){return"translate("+a.utils.NaNtoZero(d(b)+(m?d.rangeBand()/2:0))+",0)"}).select("text").attr("dy",".71em").attr("y",c.tickPadding()).attr("transform",D).style("text-anchor",j?j%360>0?"start":"end":"middle").text(function(a,b){var c=v(a);return(""+c).match("NaN")?"":c}),y.watchTransition(t,"min-max bottom").attr("transform",function(b,c){return"translate("+a.utils.NaNtoZero(d(b)+(m?d.rangeBand()/2:0))+",0)"}));break;case"right":w.enter().append("text").attr("class","nv-axislabel"),w.style("text-anchor",k?"middle":"begin").attr("transform",k?"rotate(90)":"").attr("y",k?-Math.max(e.right,f)+12-(o||0):-10).attr("x",k?d3.max(d.range())/2:c.tickPadding()),i&&(y=q.selectAll("g.nv-axisMaxMin").data(d.domain()),y.enter().append("g").attr("class",function(a,b){return["nv-axisMaxMin","nv-axisMaxMin-y",0==b?"nv-axisMin-y":"nv-axisMax-y"].join(" ")}).append("text").style("opacity",0),y.exit().remove(),y.attr("transform",function(b,c){return"translate(0,"+a.utils.NaNtoZero(d(b))+")"}).select("text").attr("dy",".32em").attr("y",0).attr("x",c.tickPadding()).style("text-anchor","start").text(function(a,b){var c=v(a);return(""+c).match("NaN")?"":c}),y.watchTransition(t,"min-max right").attr("transform",function(b,c){return"translate(0,"+a.utils.NaNtoZero(d.range()[c])+")"}).select("text").style("opacity",1));break;case"left":w.enter().append("text").attr("class","nv-axislabel"),w.style("text-anchor",k?"middle":"end").attr("transform",k?"rotate(-90)":"").attr("y",k?-Math.max(e.left,f)+25-(o||0):-10).attr("x",k?-d3.max(d.range())/2:-c.tickPadding()),i&&(y=q.selectAll("g.nv-axisMaxMin").data(d.domain()),y.enter().append("g").attr("class",function(a,b){return["nv-axisMaxMin","nv-axisMaxMin-y",0==b?"nv-axisMin-y":"nv-axisMax-y"].join(" ")}).append("text").style("opacity",0),y.exit().remove(),y.attr("transform",function(b,c){return"translate(0,"+a.utils.NaNtoZero(s(b))+")"}).select("text").attr("dy",".32em").attr("y",0).attr("x",-c.tickPadding()).attr("text-anchor","end").text(function(a,b){var c=v(a);return(""+c).match("NaN")?"":c}),y.watchTransition(t,"min-max right").attr("transform",function(b,c){return"translate(0,"+a.utils.NaNtoZero(d.range()[c])+")"}).select("text").style("opacity",1))}if(w.text(function(a){return a}),!i||"left"!==c.orient()&&"right"!==c.orient()||(u.selectAll("g").each(function(a,b){d3.select(this).select("text").attr("opacity",1),(d(a)<d.range()[1]+10||d(a)>d.range()[0]-10)&&((a>1e-10||-1e-10>a)&&d3.select(this).attr("opacity",0),d3.select(this).select("text").attr("opacity",0))}),d.domain()[0]==d.domain()[1]&&0==d.domain()[0]&&q.selectAll("g.nv-axisMaxMin").style("opacity",function(a,b){return b?0:1})),i&&("top"===c.orient()||"bottom"===c.orient())){var F=[];q.selectAll("g.nv-axisMaxMin").each(function(a,b){try{b?F.push(d(a)-this.getBoundingClientRect().width-4):F.push(d(a)+this.getBoundingClientRect().width+4)}catch(c){b?F.push(d(a)-4):F.push(d(a)+4)}}),u.selectAll("g").each(function(a,b){(d(a)<F[0]||d(a)>F[1])&&(a>1e-10||-1e-10>a?d3.select(this).remove():d3.select(this).select("text").remove())})}u.selectAll(".tick").filter(function(a){return!parseFloat(Math.round(1e5*a)/1e6)&&void 0!==a}).classed("zero",!0),s=d.copy()}),t.renderEnd("axis immediate"),b}var c=d3.svg.axis(),d=d3.scale.linear(),e={top:0,right:0,bottom:0,left:0},f=75,g=60,h=null,i=!0,j=0,k=!0,l=!1,m=!1,n=null,o=0,p=void 0,q=250,r=d3.dispatch("renderEnd");c.scale(d).orient("bottom").tickFormat(function(a){return a});var s,t=a.utils.renderWatch(r,q);return b.axis=c,b.dispatch=r,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{axisLabelDistance:{get:function(){return o},set:function(a){o=a}},staggerLabels:{get:function(){return l},set:function(a){l=a}},rotateLabels:{get:function(){return j},set:function(a){j=a}},rotateYLabel:{get:function(){return k},set:function(a){k=a}},showMaxMin:{get:function(){return i},set:function(a){i=a}},axisLabel:{get:function(){return h},set:function(a){h=a}},height:{get:function(){return g},set:function(a){g=a}},ticks:{get:function(){return n},set:function(a){n=a}},width:{get:function(){return f},set:function(a){f=a}},fontSize:{get:function(){return p},set:function(a){p=a}},margin:{get:function(){return e},set:function(a){e.top=void 0!==a.top?a.top:e.top,e.right=void 0!==a.right?a.right:e.right,e.bottom=void 0!==a.bottom?a.bottom:e.bottom,e.left=void 0!==a.left?a.left:e.left}},duration:{get:function(){return q},set:function(a){q=a,t.reset(q)}},scale:{get:function(){return d},set:function(e){d=e,c.scale(d),m="function"==typeof d.rangeBands,a.utils.inheritOptionsD3(b,d,["domain","range","rangeBand","rangeBands"])}}}),a.utils.initOptions(b),a.utils.inheritOptionsD3(b,c,["orient","tickValues","tickSubdivide","tickSize","tickPadding","tickFormat"]),a.utils.inheritOptionsD3(b,d,["domain","range","rangeBand","rangeBands"]),b},a.models.boxPlot=function(){"use strict";function b(l){return E.reset(),l.each(function(b){var l=j-i.left-i.right,F=k-i.top-i.bottom;A=d3.select(this),a.utils.initSVG(A),m.domain(c||b.map(function(a,b){return o(a,b)})).rangeBands(d||[0,l],.1);var G=[];if(!e){var H,I,J=[];b.forEach(function(a,b){var c=p(a),d=r(a),e=s(a),f=t(a),g=v(a);g&&g.forEach(function(a,b){J.push(w(a,b,void 0))}),e&&J.push(e),c&&J.push(c),d&&J.push(d),f&&J.push(f)}),H=d3.min(J),I=d3.max(J),G=[H,I]}n.domain(e||G),n.range(f||[F,0]),g=g||m,h=h||n.copy().range([n(0),n(0)]);var K=A.selectAll("g.nv-wrap").data([b]);K.enter().append("g").attr("class","nvd3 nv-wrap");K.attr("transform","translate("+i.left+","+i.top+")");var L=K.selectAll(".nv-boxplot").data(function(a){return a}),M=L.enter().append("g").style("stroke-opacity",1e-6).style("fill-opacity",1e-6);L.attr("class","nv-boxplot").attr("transform",function(a,b,c){return"translate("+(m(o(a,b))+.05*m.rangeBand())+", 0)"}).classed("hover",function(a){return a.hover}),L.watchTransition(E,"nv-boxplot: boxplots").style("stroke-opacity",1).style("fill-opacity",.75).delay(function(a,c){return c*C/b.length}).attr("transform",function(a,b){return"translate("+(m(o(a,b))+.05*m.rangeBand())+", 0)"}),L.exit().remove(),M.each(function(a,b){var c=d3.select(this);[s,t].forEach(function(d){if(d(a)){var e=d===s?"low":"high";c.append("line").style("stroke",u(a)||z(a,b)).attr("class","nv-boxplot-whisker nv-boxplot-"+e),c.append("line").style("stroke",u(a)||z(a,b)).attr("class","nv-boxplot-tick nv-boxplot-"+e)}})});var N=function(){return null===D?.9*m.rangeBand():Math.min(75,.9*m.rangeBand())},O=function(){return.45*m.rangeBand()-N()/2},P=function(){return.45*m.rangeBand()+N()/2};[s,t].forEach(function(a){var b=a===s?"low":"high",c=a===s?p:r;L.select("line.nv-boxplot-whisker.nv-boxplot-"+b).watchTransition(E,"nv-boxplot: boxplots").attr("x1",.45*m.rangeBand()).attr("y1",function(b,c){return n(a(b))}).attr("x2",.45*m.rangeBand()).attr("y2",function(a,b){return n(c(a))}),L.select("line.nv-boxplot-tick.nv-boxplot-"+b).watchTransition(E,"nv-boxplot: boxplots").attr("x1",O).attr("y1",function(b,c){return n(a(b))}).attr("x2",P).attr("y2",function(b,c){return n(a(b))})}),[s,t].forEach(function(a){var b=a===s?"low":"high";M.selectAll(".nv-boxplot-"+b).on("mouseover",function(b,c,d){d3.select(this).classed("hover",!0),B.elementMouseover({series:{key:a(b),color:u(b)||z(b,d)},e:d3.event})}).on("mouseout",function(b,c,d){d3.select(this).classed("hover",!1),B.elementMouseout({series:{key:a(b),color:u(b)||z(b,d)},e:d3.event})}).on("mousemove",function(a,b){B.elementMousemove({e:d3.event})})}),M.append("rect").attr("class","nv-boxplot-box").on("mouseover",function(a,b){d3.select(this).classed("hover",!0),B.elementMouseover({key:o(a),value:o(a),series:[{key:"Q3",value:r(a),color:u(a)||z(a,b)},{key:"Q2",value:q(a),color:u(a)||z(a,b)},{key:"Q1",value:p(a),color:u(a)||z(a,b)}],data:a,index:b,e:d3.event})}).on("mouseout",function(a,b){d3.select(this).classed("hover",!1),B.elementMouseout({key:o(a),value:o(a),series:[{key:"Q3",value:r(a),color:u(a)||z(a,b)},{key:"Q2",value:q(a),color:u(a)||z(a,b)},{key:"Q1",value:p(a),color:u(a)||z(a,b)}],data:a,index:b,e:d3.event})}).on("mousemove",function(a,b){B.elementMousemove({e:d3.event})}),L.select("rect.nv-boxplot-box").watchTransition(E,"nv-boxplot: boxes").attr("y",function(a,b){return n(r(a))}).attr("width",N).attr("x",O).attr("height",function(a,b){return Math.abs(n(r(a))-n(p(a)))||1}).style("fill",function(a,b){return u(a)||z(a,b)}).style("stroke",function(a,b){return u(a)||z(a,b)}),M.append("line").attr("class","nv-boxplot-median"),L.select("line.nv-boxplot-median").watchTransition(E,"nv-boxplot: boxplots line").attr("x1",O).attr("y1",function(a,b){return n(q(a))}).attr("x2",P).attr("y2",function(a,b){return n(q(a))});var Q=L.selectAll(".nv-boxplot-outlier").data(function(a){return v(a)||[]});Q.enter().append("circle").style("fill",function(a,b,c){return y(a,b,c)||z(a,c)}).style("stroke",function(a,b,c){return y(a,b,c)||z(a,c)}).style("z-index",9e3).on("mouseover",function(a,b,c){d3.select(this).classed("hover",!0),B.elementMouseover({series:{key:x(a,b,c),color:y(a,b,c)||z(a,c)},e:d3.event})}).on("mouseout",function(a,b,c){d3.select(this).classed("hover",!1),B.elementMouseout({series:{key:x(a,b,c),color:y(a,b,c)||z(a,c)},e:d3.event})}).on("mousemove",function(a,b){B.elementMousemove({e:d3.event})}),Q.attr("class","nv-boxplot-outlier"),Q.watchTransition(E,"nv-boxplot: nv-boxplot-outlier").attr("cx",.45*m.rangeBand()).attr("cy",function(a,b,c){return n(w(a,b,c))}).attr("r","3"),Q.exit().remove(),g=m.copy(),h=n.copy()}),E.renderEnd("nv-boxplot immediate"),b}var c,d,e,f,g,h,i={top:0,right:0,bottom:0,left:0},j=960,k=500,l=Math.floor(1e4*Math.random()),m=d3.scale.ordinal(),n=d3.scale.linear(),o=function(a){return a.label},p=function(a){return a.values.Q1},q=function(a){return a.values.Q2},r=function(a){return a.values.Q3},s=function(a){return a.values.whisker_low},t=function(a){return a.values.whisker_high},u=function(a){return a.color},v=function(a){return a.values.outliers},w=function(a,b,c){return a},x=function(a,b,c){return a},y=function(a,b,c){return void 0},z=a.utils.defaultColor(),A=null,B=d3.dispatch("elementMouseover","elementMouseout","elementMousemove","renderEnd"),C=250,D=null,E=a.utils.renderWatch(B,C);return b.dispatch=B,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return j},set:function(a){j=a}},height:{get:function(){return k},set:function(a){k=a}},maxBoxWidth:{get:function(){return D},set:function(a){D=a}},x:{get:function(){return o},set:function(a){o=a}},q1:{get:function(){return p},set:function(a){p=a}},q2:{get:function(){return q},set:function(a){q=a}},q3:{get:function(){return r},set:function(a){r=a}},wl:{get:function(){return s},set:function(a){s=a}},wh:{get:function(){return t},set:function(a){t=a}},itemColor:{get:function(){return u},set:function(a){u=a}},outliers:{get:function(){return v},set:function(a){v=a;}},outlierValue:{get:function(){return w},set:function(a){w=a}},outlierLabel:{get:function(){return x},set:function(a){x=a}},outlierColor:{get:function(){return y},set:function(a){y=a}},xScale:{get:function(){return m},set:function(a){m=a}},yScale:{get:function(){return n},set:function(a){n=a}},xDomain:{get:function(){return c},set:function(a){c=a}},yDomain:{get:function(){return e},set:function(a){e=a}},xRange:{get:function(){return d},set:function(a){d=a}},yRange:{get:function(){return f},set:function(a){f=a}},id:{get:function(){return l},set:function(a){l=a}},y:{get:function(){return console.warn("BoxPlot 'y' chart option is deprecated. Please use model overrides instead."),{}},set:function(a){console.warn("BoxPlot 'y' chart option is deprecated. Please use model overrides instead.")}},margin:{get:function(){return i},set:function(a){i.top=void 0!==a.top?a.top:i.top,i.right=void 0!==a.right?a.right:i.right,i.bottom=void 0!==a.bottom?a.bottom:i.bottom,i.left=void 0!==a.left?a.left:i.left}},color:{get:function(){return z},set:function(b){z=a.utils.getColor(b)}},duration:{get:function(){return C},set:function(a){C=a,E.reset(C)}}}),a.utils.initOptions(b),b},a.models.boxPlotChart=function(){"use strict";function b(k){return t.reset(),t.models(e),l&&t.models(f),m&&t.models(g),k.each(function(k){var p=d3.select(this);a.utils.initSVG(p);var t=(i||parseInt(p.style("width"))||960)-h.left-h.right,u=(j||parseInt(p.style("height"))||400)-h.top-h.bottom;if(b.update=function(){r.beforeUpdate(),p.transition().duration(s).call(b)},b.container=this,!k||!k.length){var v=p.selectAll(".nv-noData").data([q]);return v.enter().append("text").attr("class","nvd3 nv-noData").attr("dy","-.7em").style("text-anchor","middle"),v.attr("x",h.left+t/2).attr("y",h.top+u/2).text(function(a){return a}),b}p.selectAll(".nv-noData").remove(),c=e.xScale(),d=e.yScale().clamp(!0);var w=p.selectAll("g.nv-wrap.nv-boxPlotWithAxes").data([k]),x=w.enter().append("g").attr("class","nvd3 nv-wrap nv-boxPlotWithAxes").append("g"),y=x.append("defs"),z=w.select("g");x.append("g").attr("class","nv-x nv-axis"),x.append("g").attr("class","nv-y nv-axis").append("g").attr("class","nv-zeroLine").append("line"),x.append("g").attr("class","nv-barsWrap"),z.attr("transform","translate("+h.left+","+h.top+")"),n&&z.select(".nv-y.nv-axis").attr("transform","translate("+t+",0)"),e.width(t).height(u);var A=z.select(".nv-barsWrap").datum(k.filter(function(a){return!a.disabled}));if(A.transition().call(e),y.append("clipPath").attr("id","nv-x-label-clip-"+e.id()).append("rect"),z.select("#nv-x-label-clip-"+e.id()+" rect").attr("width",c.rangeBand()*(o?2:1)).attr("height",16).attr("x",-c.rangeBand()/(o?1:2)),l){f.scale(c).ticks(a.utils.calcTicksX(t/100,k)).tickSize(-u,0),z.select(".nv-x.nv-axis").attr("transform","translate(0,"+d.range()[0]+")"),z.select(".nv-x.nv-axis").call(f);var B=z.select(".nv-x.nv-axis").selectAll("g");o&&B.selectAll("text").attr("transform",function(a,b,c){return"translate(0,"+(c%2===0?"5":"17")+")"})}m&&(g.scale(d).ticks(Math.floor(u/36)).tickSize(-t,0),z.select(".nv-y.nv-axis").call(g)),z.select(".nv-zeroLine line").attr("x1",0).attr("x2",t).attr("y1",d(0)).attr("y2",d(0))}),t.renderEnd("nv-boxplot chart immediate"),b}var c,d,e=a.models.boxPlot(),f=a.models.axis(),g=a.models.axis(),h={top:15,right:10,bottom:50,left:60},i=null,j=null,k=a.utils.getColor(),l=!0,m=!0,n=!1,o=!1,p=a.models.tooltip(),q="No Data Available.",r=d3.dispatch("beforeUpdate","renderEnd"),s=250;f.orient("bottom").showMaxMin(!1).tickFormat(function(a){return a}),g.orient(n?"right":"left").tickFormat(d3.format(",.1f")),p.duration(0);var t=a.utils.renderWatch(r,s);return e.dispatch.on("elementMouseover.tooltip",function(a){p.data(a).hidden(!1)}),e.dispatch.on("elementMouseout.tooltip",function(a){p.data(a).hidden(!0)}),e.dispatch.on("elementMousemove.tooltip",function(a){p()}),b.dispatch=r,b.boxplot=e,b.xAxis=f,b.yAxis=g,b.tooltip=p,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return i},set:function(a){i=a}},height:{get:function(){return j},set:function(a){j=a}},staggerLabels:{get:function(){return o},set:function(a){o=a}},showXAxis:{get:function(){return l},set:function(a){l=a}},showYAxis:{get:function(){return m},set:function(a){m=a}},tooltipContent:{get:function(){return p},set:function(a){p=a}},noData:{get:function(){return q},set:function(a){q=a}},margin:{get:function(){return h},set:function(a){h.top=void 0!==a.top?a.top:h.top,h.right=void 0!==a.right?a.right:h.right,h.bottom=void 0!==a.bottom?a.bottom:h.bottom,h.left=void 0!==a.left?a.left:h.left}},duration:{get:function(){return s},set:function(a){s=a,t.reset(s),e.duration(s),f.duration(s),g.duration(s)}},color:{get:function(){return k},set:function(b){k=a.utils.getColor(b),e.color(k)}},rightAlignYAxis:{get:function(){return n},set:function(a){n=a,g.orient(a?"right":"left")}}}),a.utils.inheritOptions(b,e),a.utils.initOptions(b),b},a.models.bullet=function(){"use strict";function b(a,b){var c=a.slice();a.sort(function(a,d){var e=c.indexOf(a),f=c.indexOf(d);return d3.descending(b[e],b[f])})}function c(e){return e.each(function(c,e){var s=p-d.left-d.right,x=q-d.top-d.bottom;r=d3.select(this),a.utils.initSVG(r);var y=g.call(this,c,e).slice(),z=h.call(this,c,e).slice(),A=i.call(this,c,e).slice().sort(d3.descending),B=j.call(this,c,e).slice(),C=k.call(this,c,e).slice(),D=l.call(this,c,e).slice(),E=m.call(this,c,e).slice(),F=n.call(this,c,e).slice();b(C,y),b(D,z),b(E,A),b(F,B),y.sort(d3.descending),z.sort(d3.descending),B.sort(d3.descending);var G=d3.scale.linear().domain(d3.extent(d3.merge([o,y]))).range(f?[s,0]:[0,s]);this.__chart__||d3.scale.linear().domain([0,1/0]).range(G.range());this.__chart__=G;for(var H=(d3.min(y),d3.max(y),y[1],r.selectAll("g.nv-wrap.nv-bullet").data([c])),I=H.enter().append("g").attr("class","nvd3 nv-wrap nv-bullet"),J=I.append("g"),K=H.select("g"),e=0,L=y.length;L>e;e++){var M="nv-range nv-range"+e;2>=e&&(M=M+" nv-range"+w[e]),J.append("rect").attr("class",M)}J.append("rect").attr("class","nv-measure"),H.attr("transform","translate("+d.left+","+d.top+")");for(var N=function(a){return Math.abs(G(a)-G(0))},O=function(a){return G(0>a?a:0)},e=0,L=y.length;L>e;e++){var P=y[e];K.select("rect.nv-range"+e).attr("height",x).attr("width",N(P)).attr("x",O(P)).datum(P)}K.select("rect.nv-measure").style("fill",t).attr("height",x/3).attr("y",x/3).attr("width",0>B?G(0)-G(B[0]):G(B[0])-G(0)).attr("x",O(B)).on("mouseover",function(){u.elementMouseover({value:B[0],label:F[0]||"Current",color:d3.select(this).style("fill")})}).on("mousemove",function(){u.elementMousemove({value:B[0],label:F[0]||"Current",color:d3.select(this).style("fill")})}).on("mouseout",function(){u.elementMouseout({value:B[0],label:F[0]||"Current",color:d3.select(this).style("fill")})});var Q=x/6,R=z.map(function(a,b){return{value:a,label:D[b]}});J.selectAll("path.nv-markerTriangle").data(R).enter().append("path").attr("class","nv-markerTriangle").attr("d","M0,"+Q+"L"+Q+","+-Q+" "+-Q+","+-Q+"Z").on("mouseover",function(a){u.elementMouseover({value:a.value,label:a.label||"Previous",color:d3.select(this).style("fill"),pos:[G(a.value),x/2]})}).on("mousemove",function(a){u.elementMousemove({value:a.value,label:a.label||"Previous",color:d3.select(this).style("fill")})}).on("mouseout",function(a,b){u.elementMouseout({value:a.value,label:a.label||"Previous",color:d3.select(this).style("fill")})}),K.selectAll("path.nv-markerTriangle").data(R).attr("transform",function(a){return"translate("+G(a.value)+","+x/2+")"});var S=A.map(function(a,b){return{value:a,label:E[b]}});J.selectAll("path.nv-markerLine").data(S).enter().append("line").attr("cursor","").attr("class","nv-markerLine").attr("x1",function(a){return G(a.value)}).attr("y1","2").attr("x2",function(a){return G(a.value)}).attr("y2",x-2).on("mouseover",function(a){u.elementMouseover({value:a.value,label:a.label||"Previous",color:d3.select(this).style("fill"),pos:[G(a.value),x/2]})}).on("mousemove",function(a){u.elementMousemove({value:a.value,label:a.label||"Previous",color:d3.select(this).style("fill")})}).on("mouseout",function(a,b){u.elementMouseout({value:a.value,label:a.label||"Previous",color:d3.select(this).style("fill")})}),K.selectAll("path.nv-markerLines").data(S).attr("transform",function(a){return"translate("+G(a.value)+","+x/2+")"}),H.selectAll(".nv-range").on("mouseover",function(a,b){var c=C[b]||v[b];u.elementMouseover({value:a,label:c,color:d3.select(this).style("fill")})}).on("mousemove",function(){u.elementMousemove({value:B[0],label:F[0]||"Previous",color:d3.select(this).style("fill")})}).on("mouseout",function(a,b){var c=C[b]||v[b];u.elementMouseout({value:a,label:c,color:d3.select(this).style("fill")})})}),c}var d={top:0,right:0,bottom:0,left:0},e="left",f=!1,g=function(a){return a.ranges},h=function(a){return a.markers?a.markers:[]},i=function(a){return a.markerLines?a.markerLines:[0]},j=function(a){return a.measures},k=function(a){return a.rangeLabels?a.rangeLabels:[]},l=function(a){return a.markerLabels?a.markerLabels:[]},m=function(a){return a.markerLineLabels?a.markerLineLabels:[]},n=function(a){return a.measureLabels?a.measureLabels:[]},o=[0],p=380,q=30,r=null,s=null,t=a.utils.getColor(["#1f77b4"]),u=d3.dispatch("elementMouseover","elementMouseout","elementMousemove"),v=["Maximum","Mean","Minimum"],w=["Max","Avg","Min"];return c.dispatch=u,c.options=a.utils.optionsFunc.bind(c),c._options=Object.create({},{ranges:{get:function(){return g},set:function(a){g=a}},markers:{get:function(){return h},set:function(a){h=a}},measures:{get:function(){return j},set:function(a){j=a}},forceX:{get:function(){return o},set:function(a){o=a}},width:{get:function(){return p},set:function(a){p=a}},height:{get:function(){return q},set:function(a){q=a}},tickFormat:{get:function(){return s},set:function(a){s=a}},margin:{get:function(){return d},set:function(a){d.top=void 0!==a.top?a.top:d.top,d.right=void 0!==a.right?a.right:d.right,d.bottom=void 0!==a.bottom?a.bottom:d.bottom,d.left=void 0!==a.left?a.left:d.left}},orient:{get:function(){return e},set:function(a){e=a,f="right"==e||"bottom"==e}},color:{get:function(){return t},set:function(b){t=a.utils.getColor(b)}}}),a.utils.initOptions(c),c},a.models.bulletChart=function(){"use strict";function b(d){return d.each(function(e,o){var p=d3.select(this);a.utils.initSVG(p);var q=a.utils.availableWidth(k,p,g),r=l-g.top-g.bottom;if(b.update=function(){b(d)},b.container=this,!e||!h.call(this,e,o))return a.utils.noData(b,p),b;p.selectAll(".nv-noData").remove();var s=h.call(this,e,o).slice().sort(d3.descending),t=i.call(this,e,o).slice().sort(d3.descending),u=j.call(this,e,o).slice().sort(d3.descending),v=p.selectAll("g.nv-wrap.nv-bulletChart").data([e]),w=v.enter().append("g").attr("class","nvd3 nv-wrap nv-bulletChart"),x=w.append("g"),y=v.select("g");x.append("g").attr("class","nv-bulletWrap"),x.append("g").attr("class","nv-titles"),v.attr("transform","translate("+g.left+","+g.top+")");var z=d3.scale.linear().domain([0,Math.max(s[0],t[0]||0,u[0])]).range(f?[q,0]:[0,q]),A=this.__chart__||d3.scale.linear().domain([0,1/0]).range(z.range());this.__chart__=z;var B=x.select(".nv-titles").append("g").attr("text-anchor","end").attr("transform","translate(-6,"+(l-g.top-g.bottom)/2+")");B.append("text").attr("class","nv-title").text(function(a){return a.title}),B.append("text").attr("class","nv-subtitle").attr("dy","1em").text(function(a){return a.subtitle}),c.width(q).height(r);var C=y.select(".nv-bulletWrap");d3.transition(C).call(c);var D=m||z.tickFormat(q/100),E=y.selectAll("g.nv-tick").data(z.ticks(n?n:q/50),function(a){return this.textContent||D(a)}),F=E.enter().append("g").attr("class","nv-tick").attr("transform",function(a){return"translate("+A(a)+",0)"}).style("opacity",1e-6);F.append("line").attr("y1",r).attr("y2",7*r/6),F.append("text").attr("text-anchor","middle").attr("dy","1em").attr("y",7*r/6).text(D);var G=d3.transition(E).attr("transform",function(a){return"translate("+z(a)+",0)"}).style("opacity",1);G.select("line").attr("y1",r).attr("y2",7*r/6),G.select("text").attr("y",7*r/6),d3.transition(E.exit()).attr("transform",function(a){return"translate("+z(a)+",0)"}).style("opacity",1e-6).remove()}),d3.timer.flush(),b}var c=a.models.bullet(),d=a.models.tooltip(),e="left",f=!1,g={top:5,right:40,bottom:20,left:120},h=function(a){return a.ranges},i=function(a){return a.markers?a.markers:[]},j=function(a){return a.measures},k=null,l=55,m=null,n=null,o=null,p=d3.dispatch();return d.duration(0).headerEnabled(!1),c.dispatch.on("elementMouseover.tooltip",function(a){a.series={key:a.label,value:a.value,color:a.color},d.data(a).hidden(!1)}),c.dispatch.on("elementMouseout.tooltip",function(a){d.hidden(!0)}),c.dispatch.on("elementMousemove.tooltip",function(a){d()}),b.bullet=c,b.dispatch=p,b.tooltip=d,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{ranges:{get:function(){return h},set:function(a){h=a}},markers:{get:function(){return i},set:function(a){i=a}},measures:{get:function(){return j},set:function(a){j=a}},width:{get:function(){return k},set:function(a){k=a}},height:{get:function(){return l},set:function(a){l=a}},tickFormat:{get:function(){return m},set:function(a){m=a}},ticks:{get:function(){return n},set:function(a){n=a}},noData:{get:function(){return o},set:function(a){o=a}},margin:{get:function(){return g},set:function(a){g.top=void 0!==a.top?a.top:g.top,g.right=void 0!==a.right?a.right:g.right,g.bottom=void 0!==a.bottom?a.bottom:g.bottom,g.left=void 0!==a.left?a.left:g.left}},orient:{get:function(){return e},set:function(a){e=a,f="right"==e||"bottom"==e}}}),a.utils.inheritOptions(b,c),a.utils.initOptions(b),b},a.models.candlestickBar=function(){"use strict";function b(x){return x.each(function(b){c=d3.select(this);var x=a.utils.availableWidth(i,c,h),y=a.utils.availableHeight(j,c,h);a.utils.initSVG(c);var A=x/b[0].values.length*.45;l.domain(d||d3.extent(b[0].values.map(n).concat(t))),v?l.range(f||[.5*x/b[0].values.length,x*(b[0].values.length-.5)/b[0].values.length]):l.range(f||[5+A/2,x-A/2-5]),m.domain(e||[d3.min(b[0].values.map(s).concat(u)),d3.max(b[0].values.map(r).concat(u))]).range(g||[y,0]),l.domain()[0]===l.domain()[1]&&(l.domain()[0]?l.domain([l.domain()[0]-.01*l.domain()[0],l.domain()[1]+.01*l.domain()[1]]):l.domain([-1,1])),m.domain()[0]===m.domain()[1]&&(m.domain()[0]?m.domain([m.domain()[0]+.01*m.domain()[0],m.domain()[1]-.01*m.domain()[1]]):m.domain([-1,1]));var B=d3.select(this).selectAll("g.nv-wrap.nv-candlestickBar").data([b[0].values]),C=B.enter().append("g").attr("class","nvd3 nv-wrap nv-candlestickBar"),D=C.append("defs"),E=C.append("g"),F=B.select("g");E.append("g").attr("class","nv-ticks"),B.attr("transform","translate("+h.left+","+h.top+")"),c.on("click",function(a,b){z.chartClick({data:a,index:b,pos:d3.event,id:k})}),D.append("clipPath").attr("id","nv-chart-clip-path-"+k).append("rect"),B.select("#nv-chart-clip-path-"+k+" rect").attr("width",x).attr("height",y),F.attr("clip-path",w?"url(#nv-chart-clip-path-"+k+")":"");var G=B.select(".nv-ticks").selectAll(".nv-tick").data(function(a){return a});G.exit().remove();var H=G.enter().append("g");G.attr("class",function(a,b,c){return(p(a,b)>q(a,b)?"nv-tick negative":"nv-tick positive")+" nv-tick-"+c+"-"+b});H.append("line").attr("class","nv-candlestick-lines").attr("transform",function(a,b){return"translate("+l(n(a,b))+",0)"}).attr("x1",0).attr("y1",function(a,b){return m(r(a,b))}).attr("x2",0).attr("y2",function(a,b){return m(s(a,b))}),H.append("rect").attr("class","nv-candlestick-rects nv-bars").attr("transform",function(a,b){return"translate("+(l(n(a,b))-A/2)+","+(m(o(a,b))-(p(a,b)>q(a,b)?m(q(a,b))-m(p(a,b)):0))+")"}).attr("x",0).attr("y",0).attr("width",A).attr("height",function(a,b){var c=p(a,b),d=q(a,b);return c>d?m(d)-m(c):m(c)-m(d)});G.select(".nv-candlestick-lines").transition().attr("transform",function(a,b){return"translate("+l(n(a,b))+",0)"}).attr("x1",0).attr("y1",function(a,b){return m(r(a,b))}).attr("x2",0).attr("y2",function(a,b){return m(s(a,b))}),G.select(".nv-candlestick-rects").transition().attr("transform",function(a,b){return"translate("+(l(n(a,b))-A/2)+","+(m(o(a,b))-(p(a,b)>q(a,b)?m(q(a,b))-m(p(a,b)):0))+")"}).attr("x",0).attr("y",0).attr("width",A).attr("height",function(a,b){var c=p(a,b),d=q(a,b);return c>d?m(d)-m(c):m(c)-m(d)})}),b}var c,d,e,f,g,h={top:0,right:0,bottom:0,left:0},i=null,j=null,k=Math.floor(1e4*Math.random()),l=d3.scale.linear(),m=d3.scale.linear(),n=function(a){return a.x},o=function(a){return a.y},p=function(a){return a.open},q=function(a){return a.close},r=function(a){return a.high},s=function(a){return a.low},t=[],u=[],v=!1,w=!0,x=a.utils.defaultColor(),y=!1,z=d3.dispatch("stateChange","changeState","renderEnd","chartClick","elementClick","elementDblClick","elementMouseover","elementMouseout","elementMousemove");return b.highlightPoint=function(a,d){b.clearHighlights(),c.select(".nv-candlestickBar .nv-tick-0-"+a).classed("hover",d)},b.clearHighlights=function(){c.select(".nv-candlestickBar .nv-tick.hover").classed("hover",!1)},b.dispatch=z,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return i},set:function(a){i=a}},height:{get:function(){return j},set:function(a){j=a}},xScale:{get:function(){return l},set:function(a){l=a}},yScale:{get:function(){return m},set:function(a){m=a}},xDomain:{get:function(){return d},set:function(a){d=a}},yDomain:{get:function(){return e},set:function(a){e=a}},xRange:{get:function(){return f},set:function(a){f=a}},yRange:{get:function(){return g},set:function(a){g=a}},forceX:{get:function(){return t},set:function(a){t=a}},forceY:{get:function(){return u},set:function(a){u=a}},padData:{get:function(){return v},set:function(a){v=a}},clipEdge:{get:function(){return w},set:function(a){w=a}},id:{get:function(){return k},set:function(a){k=a}},interactive:{get:function(){return y},set:function(a){y=a}},x:{get:function(){return n},set:function(a){n=a}},y:{get:function(){return o},set:function(a){o=a}},open:{get:function(){return p()},set:function(a){p=a}},close:{get:function(){return q()},set:function(a){q=a}},high:{get:function(){return r},set:function(a){r=a}},low:{get:function(){return s},set:function(a){s=a}},margin:{get:function(){return h},set:function(a){h.top=void 0!=a.top?a.top:h.top,h.right=void 0!=a.right?a.right:h.right,h.bottom=void 0!=a.bottom?a.bottom:h.bottom,h.left=void 0!=a.left?a.left:h.left}},color:{get:function(){return x},set:function(b){x=a.utils.getColor(b)}}}),a.utils.initOptions(b),b},a.models.cumulativeLineChart=function(){"use strict";function b(l){return H.reset(),H.models(f),r&&H.models(g),s&&H.models(h),l.each(function(l){function A(a,c){d3.select(b.container).style("cursor","ew-resize")}function E(a,b){G.x=d3.event.x,G.i=Math.round(F.invert(G.x)),K()}function H(a,c){d3.select(b.container).style("cursor","auto"),y.index=G.i,C.stateChange(y)}function K(){aa.data([G]);var a=b.duration();b.duration(0),b.update(),b.duration(a)}var L=d3.select(this);a.utils.initSVG(L),L.classed("nv-chart-"+x,!0);var M=a.utils.availableWidth(o,L,m),N=a.utils.availableHeight(p,L,m);if(b.update=function(){0===D?L.call(b):L.transition().duration(D).call(b)},b.container=this,y.setter(J(l),b.update).getter(I(l)).update(),y.disabled=l.map(function(a){return!!a.disabled}),!z){var O;z={};for(O in y)y[O]instanceof Array?z[O]=y[O].slice(0):z[O]=y[O]}var P=d3.behavior.drag().on("dragstart",A).on("drag",E).on("dragend",H);if(!(l&&l.length&&l.filter(function(a){return a.values.length}).length))return a.utils.noData(b,L),b;if(L.selectAll(".nv-noData").remove(),d=f.xScale(),e=f.yScale(),w)f.yDomain(null);else{var Q=l.filter(function(a){return!a.disabled}).map(function(a,b){var c=d3.extent(a.values,f.y());return c[0]<-.95&&(c[0]=-.95),[(c[0]-c[1])/(1+c[1]),(c[1]-c[0])/(1+c[0])]}),R=[d3.min(Q,function(a){return a[0]}),d3.max(Q,function(a){return a[1]})];f.yDomain(R)}F.domain([0,l[0].values.length-1]).range([0,M]).clamp(!0);var l=c(G.i,l),S=v?"none":"all",T=L.selectAll("g.nv-wrap.nv-cumulativeLine").data([l]),U=T.enter().append("g").attr("class","nvd3 nv-wrap nv-cumulativeLine").append("g"),V=T.select("g");if(U.append("g").attr("class","nv-interactive"),U.append("g").attr("class","nv-x nv-axis").style("pointer-events","none"),U.append("g").attr("class","nv-y nv-axis"),U.append("g").attr("class","nv-background"),U.append("g").attr("class","nv-linesWrap").style("pointer-events",S),U.append("g").attr("class","nv-avgLinesWrap").style("pointer-events","none"),U.append("g").attr("class","nv-legendWrap"),U.append("g").attr("class","nv-controlsWrap"),q?(i.width(M),V.select(".nv-legendWrap").datum(l).call(i),i.height()>m.top&&(m.top=i.height(),N=a.utils.availableHeight(p,L,m)),V.select(".nv-legendWrap").attr("transform","translate(0,"+-m.top+")")):V.select(".nv-legendWrap").selectAll("*").remove(),u){var W=[{key:"Re-scale y-axis",disabled:!w}];j.width(140).color(["#444","#444","#444"]).rightAlign(!1).margin({top:5,right:0,bottom:5,left:20}),V.select(".nv-controlsWrap").datum(W).attr("transform","translate(0,"+-m.top+")").call(j)}else V.select(".nv-controlsWrap").selectAll("*").remove();T.attr("transform","translate("+m.left+","+m.top+")"),t&&V.select(".nv-y.nv-axis").attr("transform","translate("+M+",0)");var X=l.filter(function(a){return a.tempDisabled});T.select(".tempDisabled").remove(),X.length&&T.append("text").attr("class","tempDisabled").attr("x",M/2).attr("y","-.71em").style("text-anchor","end").text(X.map(function(a){return a.key}).join(", ")+" values cannot be calculated for this time period."),v&&(k.width(M).height(N).margin({left:m.left,top:m.top}).svgContainer(L).xScale(d),T.select(".nv-interactive").call(k)),U.select(".nv-background").append("rect"),V.select(".nv-background rect").attr("width",M).attr("height",N),f.y(function(a){return a.display.y}).width(M).height(N).color(l.map(function(a,b){return a.color||n(a,b)}).filter(function(a,b){return!l[b].disabled&&!l[b].tempDisabled}));var Y=V.select(".nv-linesWrap").datum(l.filter(function(a){return!a.disabled&&!a.tempDisabled}));Y.call(f),l.forEach(function(a,b){a.seriesIndex=b});var Z=l.filter(function(a){return!a.disabled&&!!B(a)}),$=V.select(".nv-avgLinesWrap").selectAll("line").data(Z,function(a){return a.key}),_=function(a){var b=e(B(a));return 0>b?0:b>N?N:b};$.enter().append("line").style("stroke-width",2).style("stroke-dasharray","10,10").style("stroke",function(a,b){return f.color()(a,a.seriesIndex)}).attr("x1",0).attr("x2",M).attr("y1",_).attr("y2",_),$.style("stroke-opacity",function(a){var b=e(B(a));return 0>b||b>N?0:1}).attr("x1",0).attr("x2",M).attr("y1",_).attr("y2",_),$.exit().remove();var aa=Y.selectAll(".nv-indexLine").data([G]);aa.enter().append("rect").attr("class","nv-indexLine").attr("width",3).attr("x",-2).attr("fill","red").attr("fill-opacity",.5).style("pointer-events","all").call(P),aa.attr("transform",function(a){return"translate("+F(a.i)+",0)"}).attr("height",N),r&&(g.scale(d)._ticks(a.utils.calcTicksX(M/70,l)).tickSize(-N,0),V.select(".nv-x.nv-axis").attr("transform","translate(0,"+e.range()[0]+")"),V.select(".nv-x.nv-axis").call(g)),s&&(h.scale(e)._ticks(a.utils.calcTicksY(N/36,l)).tickSize(-M,0),V.select(".nv-y.nv-axis").call(h)),V.select(".nv-background rect").on("click",function(){G.x=d3.mouse(this)[0],G.i=Math.round(F.invert(G.x)),y.index=G.i,C.stateChange(y),K()}),f.dispatch.on("elementClick",function(a){G.i=a.pointIndex,G.x=F(G.i),y.index=G.i,C.stateChange(y),K()}),j.dispatch.on("legendClick",function(a,c){a.disabled=!a.disabled,w=!a.disabled,y.rescaleY=w,C.stateChange(y),b.update()}),i.dispatch.on("stateChange",function(a){for(var c in a)y[c]=a[c];C.stateChange(y),b.update()}),k.dispatch.on("elementMousemove",function(c){f.clearHighlights();var d,e,i,j=[];if(l.filter(function(a,b){return a.seriesIndex=b,!a.disabled}).forEach(function(g,h){e=a.interactiveBisect(g.values,c.pointXValue,b.x()),f.highlightPoint(h,e,!0);var k=g.values[e];"undefined"!=typeof k&&("undefined"==typeof d&&(d=k),"undefined"==typeof i&&(i=b.xScale()(b.x()(k,e))),j.push({key:g.key,value:b.y()(k,e),color:n(g,g.seriesIndex)}))}),j.length>2){var m=b.yScale().invert(c.mouseY),o=Math.abs(b.yScale().domain()[0]-b.yScale().domain()[1]),p=.03*o,q=a.nearestValueIndex(j.map(function(a){return a.value}),m,p);null!==q&&(j[q].highlight=!0)}var r=g.tickFormat()(b.x()(d,e),e);k.tooltip.valueFormatter(function(a,b){return h.tickFormat()(a)}).data({value:r,series:j})(),k.renderGuideLine(i)}),k.dispatch.on("elementMouseout",function(a){f.clearHighlights()}),C.on("changeState",function(a){"undefined"!=typeof a.disabled&&(l.forEach(function(b,c){b.disabled=a.disabled[c]}),y.disabled=a.disabled),"undefined"!=typeof a.index&&(G.i=a.index,G.x=F(G.i),y.index=a.index,aa.data([G])),"undefined"!=typeof a.rescaleY&&(w=a.rescaleY),b.update()})}),H.renderEnd("cumulativeLineChart immediate"),b}function c(a,b){return K||(K=f.y()),b.map(function(b,c){if(!b.values)return b;var d=b.values[a];if(null==d)return b;var e=K(d,a);return-.95>e&&!E?(b.tempDisabled=!0,b):(b.tempDisabled=!1,b.values=b.values.map(function(a,b){return a.display={y:(K(a,b)-e)/(1+e)},a}),b)})}var d,e,f=a.models.line(),g=a.models.axis(),h=a.models.axis(),i=a.models.legend(),j=a.models.legend(),k=a.interactiveGuideline(),l=a.models.tooltip(),m={top:30,right:30,bottom:50,left:60},n=a.utils.defaultColor(),o=null,p=null,q=!0,r=!0,s=!0,t=!1,u=!0,v=!1,w=!0,x=f.id(),y=a.utils.state(),z=null,A=null,B=function(a){return a.average},C=d3.dispatch("stateChange","changeState","renderEnd"),D=250,E=!1;y.index=0,y.rescaleY=w,g.orient("bottom").tickPadding(7),h.orient(t?"right":"left"),l.valueFormatter(function(a,b){return h.tickFormat()(a,b)}).headerFormatter(function(a,b){return g.tickFormat()(a,b)}),j.updateState(!1);var F=d3.scale.linear(),G={i:0,x:0},H=a.utils.renderWatch(C,D),I=function(a){return function(){return{active:a.map(function(a){return!a.disabled}),index:G.i,rescaleY:w}}},J=function(a){return function(b){void 0!==b.index&&(G.i=b.index),void 0!==b.rescaleY&&(w=b.rescaleY),void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}};f.dispatch.on("elementMouseover.tooltip",function(a){var c={x:b.x()(a.point),y:b.y()(a.point),color:a.point.color};a.point=c,l.data(a).hidden(!1)}),f.dispatch.on("elementMouseout.tooltip",function(a){l.hidden(!0)});var K=null;return b.dispatch=C,b.lines=f,b.legend=i,b.controls=j,b.xAxis=g,b.yAxis=h,b.interactiveLayer=k,b.state=y,b.tooltip=l,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return o},set:function(a){o=a}},height:{get:function(){return p},set:function(a){p=a}},rescaleY:{get:function(){return w},set:function(a){w=a}},showControls:{get:function(){return u},set:function(a){u=a}},showLegend:{get:function(){return q},set:function(a){q=a}},average:{get:function(){return B},set:function(a){B=a}},defaultState:{get:function(){return z},set:function(a){z=a}},noData:{get:function(){return A},set:function(a){A=a}},showXAxis:{get:function(){return r},set:function(a){r=a}},showYAxis:{get:function(){return s},set:function(a){s=a}},noErrorCheck:{get:function(){return E},set:function(a){E=a}},margin:{get:function(){return m},set:function(a){m.top=void 0!==a.top?a.top:m.top,m.right=void 0!==a.right?a.right:m.right,m.bottom=void 0!==a.bottom?a.bottom:m.bottom,m.left=void 0!==a.left?a.left:m.left}},color:{get:function(){return n},set:function(b){n=a.utils.getColor(b),i.color(n)}},useInteractiveGuideline:{get:function(){return v},set:function(a){v=a,a===!0&&(b.interactive(!1),b.useVoronoi(!1))}},rightAlignYAxis:{get:function(){return t},set:function(a){t=a,h.orient(a?"right":"left")}},duration:{get:function(){return D},set:function(a){D=a,f.duration(D),g.duration(D),h.duration(D),H.reset(D)}}}),a.utils.inheritOptions(b,f),a.utils.initOptions(b),b},a.models.discreteBar=function(){"use strict";function b(m){return y.reset(),m.each(function(b){var m=k-j.left-j.right,x=l-j.top-j.bottom;c=d3.select(this),a.utils.initSVG(c),b.forEach(function(a,b){a.values.forEach(function(a){a.series=b})});var z=d&&e?[]:b.map(function(a){return a.values.map(function(a,b){return{x:p(a,b),y:q(a,b),y0:a.y0}})});n.domain(d||d3.merge(z).map(function(a){return a.x})).rangeBands(f||[0,m],.1),o.domain(e||d3.extent(d3.merge(z).map(function(a){return a.y}).concat(r))),t?o.range(g||[x-(o.domain()[0]<0?12:0),o.domain()[1]>0?12:0]):o.range(g||[x,0]),h=h||n,i=i||o.copy().range([o(0),o(0)]);var A=c.selectAll("g.nv-wrap.nv-discretebar").data([b]),B=A.enter().append("g").attr("class","nvd3 nv-wrap nv-discretebar"),C=B.append("g");A.select("g");C.append("g").attr("class","nv-groups"),A.attr("transform","translate("+j.left+","+j.top+")");var D=A.select(".nv-groups").selectAll(".nv-group").data(function(a){return a},function(a){return a.key});D.enter().append("g").style("stroke-opacity",1e-6).style("fill-opacity",1e-6),D.exit().watchTransition(y,"discreteBar: exit groups").style("stroke-opacity",1e-6).style("fill-opacity",1e-6).remove(),D.attr("class",function(a,b){return"nv-group nv-series-"+b}).classed("hover",function(a){return a.hover}),D.watchTransition(y,"discreteBar: groups").style("stroke-opacity",1).style("fill-opacity",.75);var E=D.selectAll("g.nv-bar").data(function(a){return a.values});E.exit().remove();var F=E.enter().append("g").attr("transform",function(a,b,c){return"translate("+(n(p(a,b))+.05*n.rangeBand())+", "+o(0)+")"}).on("mouseover",function(a,b){d3.select(this).classed("hover",!0),v.elementMouseover({data:a,index:b,color:d3.select(this).style("fill")})}).on("mouseout",function(a,b){d3.select(this).classed("hover",!1),v.elementMouseout({data:a,index:b,color:d3.select(this).style("fill")})}).on("mousemove",function(a,b){v.elementMousemove({data:a,index:b,color:d3.select(this).style("fill")})}).on("click",function(a,b){var c=this;v.elementClick({data:a,index:b,color:d3.select(this).style("fill"),event:d3.event,element:c}),d3.event.stopPropagation()}).on("dblclick",function(a,b){v.elementDblClick({data:a,index:b,color:d3.select(this).style("fill")}),d3.event.stopPropagation()});F.append("rect").attr("height",0).attr("width",.9*n.rangeBand()/b.length),t?(F.append("text").attr("text-anchor","middle"),E.select("text").text(function(a,b){return u(q(a,b))}).watchTransition(y,"discreteBar: bars text").attr("x",.9*n.rangeBand()/2).attr("y",function(a,b){return q(a,b)<0?o(q(a,b))-o(0)+12:-4})):E.selectAll("text").remove(),E.attr("class",function(a,b){return q(a,b)<0?"nv-bar negative":"nv-bar positive"}).style("fill",function(a,b){return a.color||s(a,b)}).style("stroke",function(a,b){return a.color||s(a,b)}).select("rect").attr("class",w).watchTransition(y,"discreteBar: bars rect").attr("width",.9*n.rangeBand()/b.length),E.watchTransition(y,"discreteBar: bars").attr("transform",function(a,b){var c=n(p(a,b))+.05*n.rangeBand(),d=q(a,b)<0?o(0):o(0)-o(q(a,b))<1?o(0)-1:o(q(a,b));return"translate("+c+", "+d+")"}).select("rect").attr("height",function(a,b){return Math.max(Math.abs(o(q(a,b))-o(0)),1)}),h=n.copy(),i=o.copy()}),y.renderEnd("discreteBar immediate"),b}var c,d,e,f,g,h,i,j={top:0,right:0,bottom:0,left:0},k=960,l=500,m=Math.floor(1e4*Math.random()),n=d3.scale.ordinal(),o=d3.scale.linear(),p=function(a){return a.x},q=function(a){return a.y},r=[0],s=a.utils.defaultColor(),t=!1,u=d3.format(",.2f"),v=d3.dispatch("chartClick","elementClick","elementDblClick","elementMouseover","elementMouseout","elementMousemove","renderEnd"),w="discreteBar",x=250,y=a.utils.renderWatch(v,x);return b.dispatch=v,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return k},set:function(a){k=a}},height:{get:function(){return l},set:function(a){l=a}},forceY:{get:function(){return r},set:function(a){r=a}},showValues:{get:function(){return t},set:function(a){t=a}},x:{get:function(){return p},set:function(a){p=a}},y:{get:function(){return q},set:function(a){q=a}},xScale:{get:function(){return n},set:function(a){n=a}},yScale:{get:function(){return o},set:function(a){o=a}},xDomain:{get:function(){return d},set:function(a){d=a}},yDomain:{get:function(){return e},set:function(a){e=a}},xRange:{get:function(){return f},set:function(a){f=a}},yRange:{get:function(){return g},set:function(a){g=a}},valueFormat:{get:function(){return u},set:function(a){u=a}},id:{get:function(){return m},set:function(a){m=a}},rectClass:{get:function(){return w},set:function(a){w=a}},margin:{get:function(){return j},set:function(a){j.top=void 0!==a.top?a.top:j.top,j.right=void 0!==a.right?a.right:j.right,j.bottom=void 0!==a.bottom?a.bottom:j.bottom,j.left=void 0!==a.left?a.left:j.left}},color:{get:function(){return s},set:function(b){s=a.utils.getColor(b)}},duration:{get:function(){return x},set:function(a){x=a,y.reset(x)}}}),a.utils.initOptions(b),b},a.models.discreteBarChart=function(){"use strict";function b(i){return x.reset(),x.models(e),o&&x.models(f),p&&x.models(g),i.each(function(i){var m=d3.select(this);a.utils.initSVG(m);var u=a.utils.availableWidth(k,m,j),x=a.utils.availableHeight(l,m,j);if(b.update=function(){v.beforeUpdate(),m.transition().duration(w).call(b)},b.container=this,!(i&&i.length&&i.filter(function(a){return a.values.length}).length))return a.utils.noData(b,m),b;m.selectAll(".nv-noData").remove(),c=e.xScale(),d=e.yScale().clamp(!0);var y=m.selectAll("g.nv-wrap.nv-discreteBarWithAxes").data([i]),z=y.enter().append("g").attr("class","nvd3 nv-wrap nv-discreteBarWithAxes").append("g"),A=z.append("defs"),B=y.select("g");z.append("g").attr("class","nv-x nv-axis"),z.append("g").attr("class","nv-y nv-axis").append("g").attr("class","nv-zeroLine").append("line"),z.append("g").attr("class","nv-barsWrap"),z.append("g").attr("class","nv-legendWrap"),B.attr("transform","translate("+j.left+","+j.top+")"),n?(h.width(u),B.select(".nv-legendWrap").datum(i).call(h),h.height()>j.top&&(j.top=h.height(),x=a.utils.availableHeight(l,m,j)),y.select(".nv-legendWrap").attr("transform","translate(0,"+-j.top+")")):B.select(".nv-legendWrap").selectAll("*").remove(),q&&B.select(".nv-y.nv-axis").attr("transform","translate("+u+",0)"),e.width(u).height(x);var C=B.select(".nv-barsWrap").datum(i.filter(function(a){return!a.disabled}));if(C.transition().call(e),A.append("clipPath").attr("id","nv-x-label-clip-"+e.id()).append("rect"),B.select("#nv-x-label-clip-"+e.id()+" rect").attr("width",c.rangeBand()*(r?2:1)).attr("height",16).attr("x",-c.rangeBand()/(r?1:2)),o){f.scale(c)._ticks(a.utils.calcTicksX(u/100,i)).tickSize(-x,0),B.select(".nv-x.nv-axis").attr("transform","translate(0,"+(d.range()[0]+(e.showValues()&&d.domain()[0]<0?16:0))+")"),B.select(".nv-x.nv-axis").call(f);var D=B.select(".nv-x.nv-axis").selectAll("g");r&&D.selectAll("text").attr("transform",function(a,b,c){return"translate(0,"+(c%2==0?"5":"17")+")"}),t&&D.selectAll(".tick text").attr("transform","rotate("+t+" 0,0)").style("text-anchor",t>0?"start":"end"),s&&B.selectAll(".tick text").call(a.utils.wrapTicks,b.xAxis.rangeBand())}p&&(g.scale(d)._ticks(a.utils.calcTicksY(x/36,i)).tickSize(-u,0),B.select(".nv-y.nv-axis").call(g)),B.select(".nv-zeroLine line").attr("x1",0).attr("x2",q?-u:u).attr("y1",d(0)).attr("y2",d(0))}),x.renderEnd("discreteBar chart immediate"),b}var c,d,e=a.models.discreteBar(),f=a.models.axis(),g=a.models.axis(),h=a.models.legend(),i=a.models.tooltip(),j={top:15,right:10,bottom:50,left:60},k=null,l=null,m=a.utils.getColor(),n=!1,o=!0,p=!0,q=!1,r=!1,s=!1,t=0,u=null,v=d3.dispatch("beforeUpdate","renderEnd"),w=250;f.orient("bottom").showMaxMin(!1).tickFormat(function(a){return a}),g.orient(q?"right":"left").tickFormat(d3.format(",.1f")),i.duration(0).headerEnabled(!1).valueFormatter(function(a,b){return g.tickFormat()(a,b)}).keyFormatter(function(a,b){return f.tickFormat()(a,b)});var x=a.utils.renderWatch(v,w);return e.dispatch.on("elementMouseover.tooltip",function(a){a.series={key:b.x()(a.data),value:b.y()(a.data),color:a.color},i.data(a).hidden(!1)}),e.dispatch.on("elementMouseout.tooltip",function(a){i.hidden(!0)}),e.dispatch.on("elementMousemove.tooltip",function(a){i()}),b.dispatch=v,b.discretebar=e,b.legend=h,b.xAxis=f,b.yAxis=g,b.tooltip=i,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return k},set:function(a){k=a}},height:{get:function(){return l},set:function(a){l=a}},showLegend:{get:function(){return n},set:function(a){n=a}},staggerLabels:{get:function(){return r},set:function(a){r=a}},rotateLabels:{get:function(){return t},set:function(a){t=a}},wrapLabels:{get:function(){return s},set:function(a){s=!!a}},showXAxis:{get:function(){return o},set:function(a){o=a}},showYAxis:{get:function(){return p},set:function(a){p=a}},noData:{get:function(){return u},set:function(a){u=a}},margin:{get:function(){return j},set:function(a){j.top=void 0!==a.top?a.top:j.top,j.right=void 0!==a.right?a.right:j.right,j.bottom=void 0!==a.bottom?a.bottom:j.bottom,j.left=void 0!==a.left?a.left:j.left}},duration:{get:function(){return w},set:function(a){w=a,x.reset(w),e.duration(w),f.duration(w),g.duration(w)}},color:{get:function(){return m},set:function(b){m=a.utils.getColor(b),e.color(m),h.color(m)}},rightAlignYAxis:{get:function(){return q},set:function(a){q=a,g.orient(a?"right":"left")}}}),a.utils.inheritOptions(b,e),a.utils.initOptions(b),b},a.models.distribution=function(){"use strict";function b(k){return m.reset(),k.each(function(b){var k=(e-("x"===g?d.left+d.right:d.top+d.bottom),"x"==g?"y":"x"),l=d3.select(this);a.utils.initSVG(l),c=c||j;var n=l.selectAll("g.nv-distribution").data([b]),o=n.enter().append("g").attr("class","nvd3 nv-distribution"),p=(o.append("g"),n.select("g"));n.attr("transform","translate("+d.left+","+d.top+")");var q=p.selectAll("g.nv-dist").data(function(a){return a},function(a){return a.key});q.enter().append("g"),q.attr("class",function(a,b){return"nv-dist nv-series-"+b}).style("stroke",function(a,b){return i(a,b)});var r=q.selectAll("line.nv-dist"+g).data(function(a){return a.values});r.enter().append("line").attr(g+"1",function(a,b){return c(h(a,b))}).attr(g+"2",function(a,b){return c(h(a,b))}),m.transition(q.exit().selectAll("line.nv-dist"+g),"dist exit").attr(g+"1",function(a,b){return j(h(a,b))}).attr(g+"2",function(a,b){return j(h(a,b))}).style("stroke-opacity",0).remove(),r.attr("class",function(a,b){return"nv-dist"+g+" nv-dist"+g+"-"+b}).attr(k+"1",0).attr(k+"2",f),m.transition(r,"dist").attr(g+"1",function(a,b){return j(h(a,b))}).attr(g+"2",function(a,b){return j(h(a,b))}),c=j.copy()}),m.renderEnd("distribution immediate"),b}var c,d={top:0,right:0,bottom:0,left:0},e=400,f=8,g="x",h=function(a){return a[g]},i=a.utils.defaultColor(),j=d3.scale.linear(),k=250,l=d3.dispatch("renderEnd"),m=a.utils.renderWatch(l,k);return b.options=a.utils.optionsFunc.bind(b),b.dispatch=l,b.margin=function(a){return arguments.length?(d.top="undefined"!=typeof a.top?a.top:d.top,d.right="undefined"!=typeof a.right?a.right:d.right,d.bottom="undefined"!=typeof a.bottom?a.bottom:d.bottom,d.left="undefined"!=typeof a.left?a.left:d.left,b):d},b.width=function(a){return arguments.length?(e=a,b):e},b.axis=function(a){return arguments.length?(g=a,b):g},b.size=function(a){return arguments.length?(f=a,b):f},b.getData=function(a){return arguments.length?(h=d3.functor(a),b):h},b.scale=function(a){return arguments.length?(j=a,b):j},b.color=function(c){return arguments.length?(i=a.utils.getColor(c),b):i},b.duration=function(a){return arguments.length?(k=a,m.reset(k),b):k},b},a.models.focus=function(b){"use strict";function c(t){return s.reset(),s.models(b),m&&s.models(f),n&&s.models(g),t.each(function(s){function t(a){var b=+("e"==a),c=b?1:-1,d=y/3;return"M"+.5*c+","+d+"A6,6 0 0 "+b+" "+6.5*c+","+(d+6)+"V"+(2*d-6)+"A6,6 0 0 "+b+" "+.5*c+","+2*d+"ZM"+2.5*c+","+(d+8)+"V"+(2*d-8)+"M"+4.5*c+","+(d+8)+"V"+(2*d-8)}function u(){h.empty()||h.extent(p),D.data([h.empty()?d.domain():p]).each(function(a,b){var c=d(a[0])-d.range()[0],e=x-d(a[1]);d3.select(this).select(".left").attr("width",0>c?0:c),d3.select(this).select(".right").attr("x",d(a[1])).attr("width",0>e?0:e)})}function v(){p=h.empty()?null:h.extent();var a=h.empty()?d.domain():h.extent();Math.abs(a[0]-a[1])<=1||(r.brush({extent:a,brush:h}),u(),r.onBrush(a))}var w=d3.select(this);a.utils.initSVG(w);var x=a.utils.availableWidth(k,w,i),y=l-i.top-i.bottom;c.update=function(){0===q?w.call(c):w.transition().duration(q).call(c)},c.container=this,d=b.xScale(),e=b.yScale();var z=w.selectAll("g.nv-focus").data([s]),A=z.enter().append("g").attr("class","nvd3 nv-focus").append("g"),B=z.select("g");z.attr("transform","translate("+i.left+","+i.top+")"),A.append("g").attr("class","nv-background").append("rect"),A.append("g").attr("class","nv-x nv-axis"),A.append("g").attr("class","nv-y nv-axis"),A.append("g").attr("class","nv-contentWrap"),A.append("g").attr("class","nv-brushBackground"),A.append("g").attr("class","nv-x nv-brush"),o&&B.select(".nv-y.nv-axis").attr("transform","translate("+x+",0)"),B.select(".nv-background rect").attr("width",x).attr("height",y),b.width(x).height(y).color(s.map(function(a,b){return a.color||j(a,b)}).filter(function(a,b){return!s[b].disabled}));var C=B.select(".nv-contentWrap").datum(s.filter(function(a){return!a.disabled}));d3.transition(C).call(b),h.x(d).on("brush",function(){v()}),p&&h.extent(p);var D=B.select(".nv-brushBackground").selectAll("g").data([p||h.extent()]),E=D.enter().append("g");E.append("rect").attr("class","left").attr("x",0).attr("y",0).attr("height",y),E.append("rect").attr("class","right").attr("x",0).attr("y",0).attr("height",y);var F=B.select(".nv-x.nv-brush").call(h);F.selectAll("rect").attr("height",y),F.selectAll(".resize").append("path").attr("d",t),v(),B.select(".nv-background rect").attr("width",x).attr("height",y),m&&(f.scale(d)._ticks(a.utils.calcTicksX(x/100,s)).tickSize(-y,0),B.select(".nv-x.nv-axis").attr("transform","translate(0,"+e.range()[0]+")"),d3.transition(B.select(".nv-x.nv-axis")).call(f)),n&&(g.scale(e)._ticks(a.utils.calcTicksY(y/36,s)).tickSize(-x,0),d3.transition(B.select(".nv-y.nv-axis")).call(g)),B.select(".nv-x.nv-axis").attr("transform","translate(0,"+e.range()[0]+")")}),s.renderEnd("focus immediate"),c}var d,e,b=b||a.models.line(),f=a.models.axis(),g=a.models.axis(),h=d3.svg.brush(),i={top:10,right:0,bottom:30,left:0},j=a.utils.defaultColor(),k=null,l=70,m=!0,n=!1,o=!1,p=null,q=250,r=d3.dispatch("brush","onBrush","renderEnd");b.interactive(!1),b.pointActive(function(a){return!1});var s=a.utils.renderWatch(r,q);return c.dispatch=r,c.content=b,c.brush=h,c.xAxis=f,c.yAxis=g,c.options=a.utils.optionsFunc.bind(c),c._options=Object.create({},{width:{get:function(){return k},set:function(a){k=a}},height:{get:function(){return l},set:function(a){l=a}},showXAxis:{get:function(){return m},set:function(a){m=a}},showYAxis:{get:function(){return n},set:function(a){n=a}},brushExtent:{get:function(){return p},set:function(a){p=a}},margin:{get:function(){return i},set:function(a){i.top=void 0!==a.top?a.top:i.top,i.right=void 0!==a.right?a.right:i.right,i.bottom=void 0!==a.bottom?a.bottom:i.bottom,i.left=void 0!==a.left?a.left:i.left}},duration:{get:function(){return q},set:function(a){q=a,s.reset(q),b.duration(q),f.duration(q),g.duration(q)}},color:{get:function(){return j},set:function(c){j=a.utils.getColor(c),b.color(j)}},interpolate:{get:function(){return b.interpolate()},set:function(a){b.interpolate(a)}},xTickFormat:{get:function(){return f.tickFormat()},set:function(a){f.tickFormat(a)}},yTickFormat:{get:function(){return g.tickFormat()},set:function(a){g.tickFormat(a)}},x:{get:function(){return b.x()},set:function(a){b.x(a)}},y:{get:function(){return b.y()},set:function(a){b.y(a)}},rightAlignYAxis:{get:function(){return o},set:function(a){o=a,g.orient(o?"right":"left")}}}),a.utils.inheritOptions(c,b),a.utils.initOptions(c),c},a.models.forceDirectedGraph=function(){"use strict";function b(g){return u.reset(),g.each(function(g){f=d3.select(this),a.utils.initSVG(f);var j=a.utils.availableWidth(d,f,c),u=a.utils.availableHeight(e,f,c);if(f.attr("width",j).attr("height",u),!(g&&g.links&&g.nodes))return a.utils.noData(b,f),b;f.selectAll(".nv-noData").remove(),f.selectAll("*").remove();var v=new Set;g.nodes.forEach(function(a){var b=Object.keys(a);b.forEach(function(a){v.add(a)})});var w=d3.layout.force().nodes(g.nodes).links(g.links).size([j,u]).linkStrength(k).friction(l).linkDistance(m).charge(n).gravity(o).theta(p).alpha(q).start(),x=f.selectAll(".link").data(g.links).enter().append("line").attr("class","nv-force-link").style("stroke-width",function(a){return Math.sqrt(a.value)}),y=f.selectAll(".node").data(g.nodes).enter().append("g").attr("class","nv-force-node").call(w.drag);y.append("circle").attr("r",r).style("fill",function(a){return h(a)}).on("mouseover",function(a){f.select(".nv-series-"+a.seriesIndex+" .nv-distx-"+a.pointIndex).attr("y1",a.py),f.select(".nv-series-"+a.seriesIndex+" .nv-disty-"+a.pointIndex).attr("x2",a.px);var b=h(a);a.series=[],v.forEach(function(c){a.series.push({color:b,key:c,value:a[c]})}),i.data(a).hidden(!1)}).on("mouseout",function(a){i.hidden(!0)}),i.headerFormatter(function(a){return"Node"}),t(x),s(y),w.on("tick",function(){x.attr("x1",function(a){return a.source.x}).attr("y1",function(a){return a.source.y}).attr("x2",function(a){return a.target.x}).attr("y2",function(a){return a.target.y}),y.attr("transform",function(a){return"translate("+a.x+", "+a.y+")"})})}),b}var c={top:2,right:0,bottom:2,left:0},d=400,e=32,f=null,g=d3.dispatch("renderEnd"),h=a.utils.getColor(["#000"]),i=a.models.tooltip(),j=null,k=.1,l=.9,m=30,n=-120,o=.1,p=.8,q=.1,r=5,s=function(a){},t=function(a){},u=a.utils.renderWatch(g);return b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return d},set:function(a){d=a}},height:{get:function(){return e},set:function(a){e=a}},linkStrength:{get:function(){return k},set:function(a){k=a}},friction:{get:function(){return l},set:function(a){l=a}},linkDist:{get:function(){return m},set:function(a){m=a}},charge:{get:function(){return n},set:function(a){n=a}},gravity:{get:function(){return o},set:function(a){o=a}},theta:{get:function(){return p},set:function(a){p=a}},alpha:{get:function(){return q},set:function(a){q=a}},radius:{get:function(){return r},set:function(a){r=a}},x:{get:function(){return getX},set:function(a){getX=d3.functor(a)}},y:{get:function(){return getY},set:function(a){getY=d3.functor(a)}},margin:{get:function(){return c},set:function(a){c.top=void 0!==a.top?a.top:c.top,c.right=void 0!==a.right?a.right:c.right,c.bottom=void 0!==a.bottom?a.bottom:c.bottom,c.left=void 0!==a.left?a.left:c.left}},color:{get:function(){return h},set:function(b){h=a.utils.getColor(b)}},noData:{get:function(){return j},set:function(a){j=a}},nodeExtras:{get:function(){return s},set:function(a){s=a}},linkExtras:{get:function(){return t},set:function(a){t=a}}}),b.dispatch=g,b.tooltip=i,a.utils.initOptions(b),b},a.models.furiousLegend=function(){"use strict";function b(r){function s(a,b){return"furious"!=q?"#000":o?a.disengaged?h(a,b):"#fff":o?void 0:a.disabled?h(a,b):"#fff"}function t(a,b){return o&&"furious"==q?a.disengaged?"#fff":h(a,b):a.disabled?"#fff":h(a,b)}return r.each(function(b){var r=d-c.left-c.right,u=d3.select(this);a.utils.initSVG(u);var v=u.selectAll("g.nv-legend").data([b]),w=(v.enter().append("g").attr("class","nvd3 nv-legend").append("g"),v.select("g"));v.attr("transform","translate("+c.left+","+c.top+")");var x,y=w.selectAll(".nv-series").data(function(a){return"furious"!=q?a:a.filter(function(a){return o?!0:!a.disengaged})}),z=y.enter().append("g").attr("class","nv-series");if("classic"==q)z.append("circle").style("stroke-width",2).attr("class","nv-legend-symbol").attr("r",5),x=y.select("circle");else if("furious"==q){z.append("rect").style("stroke-width",2).attr("class","nv-legend-symbol").attr("rx",3).attr("ry",3),x=y.select("rect"),z.append("g").attr("class","nv-check-box").property("innerHTML",'<path d="M0.5,5 L22.5,5 L22.5,26.5 L0.5,26.5 L0.5,5 Z" class="nv-box"></path><path d="M5.5,12.8618467 L11.9185089,19.2803556 L31,0.198864511" class="nv-check"></path>').attr("transform","translate(-10,-8)scale(0.5)");var A=y.select(".nv-check-box");A.each(function(a,b){d3.select(this).selectAll("path").attr("stroke",s(a,b))})}z.append("text").attr("text-anchor","start").attr("class","nv-legend-text").attr("dy",".32em").attr("dx","8");var B=y.select("text.nv-legend-text");y.on("mouseover",function(a,b){p.legendMouseover(a,b)}).on("mouseout",function(a,b){p.legendMouseout(a,b)}).on("click",function(a,b){p.legendClick(a,b);var c=y.data();if(m){if("classic"==q)n?(c.forEach(function(a){a.disabled=!0}),a.disabled=!1):(a.disabled=!a.disabled,c.every(function(a){return a.disabled})&&c.forEach(function(a){a.disabled=!1}));else if("furious"==q)if(o)a.disengaged=!a.disengaged,a.userDisabled=void 0==a.userDisabled?!!a.disabled:a.userDisabled,a.disabled=a.disengaged||a.userDisabled;else if(!o){a.disabled=!a.disabled,a.userDisabled=a.disabled;var d=c.filter(function(a){return!a.disengaged});d.every(function(a){return a.userDisabled})&&c.forEach(function(a){a.disabled=a.userDisabled=!1})}p.stateChange({disabled:c.map(function(a){return!!a.disabled}),disengaged:c.map(function(a){return!!a.disengaged})})}}).on("dblclick",function(a,b){if(("furious"!=q||!o)&&(p.legendDblclick(a,b),m)){var c=y.data();c.forEach(function(a){a.disabled=!0,"furious"==q&&(a.userDisabled=a.disabled)}),a.disabled=!1,"furious"==q&&(a.userDisabled=a.disabled),p.stateChange({disabled:c.map(function(a){return!!a.disabled})})}}),y.classed("nv-disabled",function(a){return a.userDisabled}),y.exit().remove(),B.attr("fill",s).text(function(a){return g(f(a))});var C;switch(q){case"furious":C=23;break;case"classic":C=20}if(j){var D=[];y.each(function(b,c){var d;if(g(f(b))&&g(f(b)).length>i){var e=g(f(b)).substring(0,i);d=d3.select(this).select("text").text(e+"..."),d3.select(this).append("svg:title").text(g(f(b)))}else d=d3.select(this).select("text");var h;try{if(h=d.node().getComputedTextLength(),0>=h)throw Error()}catch(j){h=a.utils.calcApproxTextWidth(d)}D.push(h+k)});for(var E=0,F=0,G=[];r>F&&E<D.length;)G[E]=D[E],F+=D[E++];for(0===E&&(E=1);F>r&&E>1;){G=[],E--;for(var H=0;H<D.length;H++)D[H]>(G[H%E]||0)&&(G[H%E]=D[H]);F=G.reduce(function(a,b,c,d){return a+b})}for(var I=[],J=0,K=0;E>J;J++)I[J]=K,K+=G[J];y.attr("transform",function(a,b){return"translate("+I[b%E]+","+(5+Math.floor(b/E)*C)+")"}),l?w.attr("transform","translate("+(d-c.right-F)+","+c.top+")"):w.attr("transform","translate(0,"+c.top+")"),e=c.top+c.bottom+Math.ceil(D.length/E)*C}else{var L,M=5,N=5,O=0;y.attr("transform",function(a,b){var e=d3.select(this).select("text").node().getComputedTextLength()+k;return L=N,d<c.left+c.right+L+e&&(N=L=5,M+=C),N+=e,N>O&&(O=N),"translate("+L+","+M+")"}),w.attr("transform","translate("+(d-c.right-O)+","+c.top+")"),e=c.top+c.bottom+M+15}"furious"==q&&x.attr("width",function(a,b){return B[0][b].getComputedTextLength()+27}).attr("height",18).attr("y",-9).attr("x",-15),x.style("fill",t).style("stroke",function(a,b){return a.color||h(a,b)})}),b}var c={top:5,right:0,bottom:5,left:0},d=400,e=20,f=function(a){return a.key},g=function(a){return a},h=a.utils.getColor(),i=20,j=!0,k=28,l=!0,m=!0,n=!1,o=!1,p=d3.dispatch("legendClick","legendDblclick","legendMouseover","legendMouseout","stateChange"),q="classic";return b.dispatch=p,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return d},set:function(a){d=a}},height:{get:function(){return e},set:function(a){e=a}},key:{get:function(){return f},set:function(a){f=a}},keyFormatter:{get:function(){return g},set:function(a){g=a}},align:{get:function(){return j},set:function(a){j=a}},rightAlign:{get:function(){return l},set:function(a){l=a}},maxKeyLength:{get:function(){return i},set:function(a){i=a}},padding:{get:function(){return k},set:function(a){k=a}},updateState:{get:function(){return m},set:function(a){m=a}},radioButtonMode:{get:function(){return n},set:function(a){n=a}},expanded:{get:function(){return o},set:function(a){o=a}},vers:{get:function(){return q},set:function(a){q=a}},margin:{get:function(){return c},set:function(a){c.top=void 0!==a.top?a.top:c.top,c.right=void 0!==a.right?a.right:c.right,c.bottom=void 0!==a.bottom?a.bottom:c.bottom,c.left=void 0!==a.left?a.left:c.left}},color:{get:function(){return h},set:function(b){h=a.utils.getColor(b)}}}),a.utils.initOptions(b),b},a.models.historicalBar=function(){"use strict";function b(x){return x.each(function(b){w.reset(),k=d3.select(this);var x=a.utils.availableWidth(h,k,g),y=a.utils.availableHeight(i,k,g);a.utils.initSVG(k),l.domain(c||d3.extent(b[0].values.map(n).concat(p))),r?l.range(e||[.5*x/b[0].values.length,x*(b[0].values.length-.5)/b[0].values.length]):l.range(e||[0,x]),m.domain(d||d3.extent(b[0].values.map(o).concat(q))).range(f||[y,0]),l.domain()[0]===l.domain()[1]&&(l.domain()[0]?l.domain([l.domain()[0]-.01*l.domain()[0],l.domain()[1]+.01*l.domain()[1]]):l.domain([-1,1])),m.domain()[0]===m.domain()[1]&&(m.domain()[0]?m.domain([m.domain()[0]+.01*m.domain()[0],m.domain()[1]-.01*m.domain()[1]]):m.domain([-1,1]));var z=k.selectAll("g.nv-wrap.nv-historicalBar-"+j).data([b[0].values]),A=z.enter().append("g").attr("class","nvd3 nv-wrap nv-historicalBar-"+j),B=A.append("defs"),C=A.append("g"),D=z.select("g");C.append("g").attr("class","nv-bars"),z.attr("transform","translate("+g.left+","+g.top+")"),k.on("click",function(a,b){u.chartClick({data:a,index:b,pos:d3.event,id:j})}),B.append("clipPath").attr("id","nv-chart-clip-path-"+j).append("rect"),z.select("#nv-chart-clip-path-"+j+" rect").attr("width",x).attr("height",y),D.attr("clip-path",s?"url(#nv-chart-clip-path-"+j+")":"");var E=z.select(".nv-bars").selectAll(".nv-bar").data(function(a){return a},function(a,b){return n(a,b)});E.exit().remove(),E.enter().append("rect").attr("x",0).attr("y",function(b,c){return a.utils.NaNtoZero(m(Math.max(0,o(b,c))))}).attr("height",function(b,c){return a.utils.NaNtoZero(Math.abs(m(o(b,c))-m(0)))}).attr("transform",function(a,c){return"translate("+(l(n(a,c))-x/b[0].values.length*.45)+",0)"}).on("mouseover",function(a,b){v&&(d3.select(this).classed("hover",!0),u.elementMouseover({data:a,index:b,color:d3.select(this).style("fill")}))}).on("mouseout",function(a,b){v&&(d3.select(this).classed("hover",!1),u.elementMouseout({data:a,index:b,color:d3.select(this).style("fill")}))}).on("mousemove",function(a,b){v&&u.elementMousemove({data:a,index:b,color:d3.select(this).style("fill")})}).on("click",function(a,b){v&&(u.elementClick({data:a,index:b,color:d3.select(this).style("fill")}),d3.event.stopPropagation())}).on("dblclick",function(a,b){v&&(u.elementDblClick({data:a,index:b,color:d3.select(this).style("fill")}),d3.event.stopPropagation())}),E.attr("fill",function(a,b){return t(a,b)}).attr("class",function(a,b,c){return(o(a,b)<0?"nv-bar negative":"nv-bar positive")+" nv-bar-"+c+"-"+b}).watchTransition(w,"bars").attr("transform",function(a,c){return"translate("+(l(n(a,c))-x/b[0].values.length*.45)+",0)"}).attr("width",x/b[0].values.length*.9),E.watchTransition(w,"bars").attr("y",function(b,c){var d=o(b,c)<0?m(0):m(0)-m(o(b,c))<1?m(0)-1:m(o(b,c));return a.utils.NaNtoZero(d)}).attr("height",function(b,c){return a.utils.NaNtoZero(Math.max(Math.abs(m(o(b,c))-m(0)),1))})}),w.renderEnd("historicalBar immediate"),b}var c,d,e,f,g={top:0,right:0,bottom:0,left:0},h=null,i=null,j=Math.floor(1e4*Math.random()),k=null,l=d3.scale.linear(),m=d3.scale.linear(),n=function(a){return a.x},o=function(a){return a.y},p=[],q=[0],r=!1,s=!0,t=a.utils.defaultColor(),u=d3.dispatch("chartClick","elementClick","elementDblClick","elementMouseover","elementMouseout","elementMousemove","renderEnd"),v=!0,w=a.utils.renderWatch(u,0);return b.highlightPoint=function(a,b){k.select(".nv-bars .nv-bar-0-"+a).classed("hover",b)},b.clearHighlights=function(){k.select(".nv-bars .nv-bar.hover").classed("hover",!1)},b.dispatch=u,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return h},set:function(a){h=a}},height:{get:function(){return i},set:function(a){i=a}},forceX:{get:function(){return p},set:function(a){p=a}},forceY:{get:function(){return q},set:function(a){q=a}},padData:{get:function(){return r},set:function(a){r=a}},x:{get:function(){return n},set:function(a){n=a}},y:{get:function(){return o},set:function(a){o=a}},xScale:{get:function(){return l},set:function(a){l=a}},yScale:{get:function(){return m},set:function(a){m=a}},xDomain:{get:function(){return c},set:function(a){c=a}},yDomain:{get:function(){return d},set:function(a){d=a}},xRange:{get:function(){return e},set:function(a){e=a}},yRange:{get:function(){return f},set:function(a){f=a}},clipEdge:{get:function(){return s},set:function(a){s=a}},id:{get:function(){return j},set:function(a){j=a}},interactive:{get:function(){return v},set:function(a){v=a}},margin:{get:function(){return g},set:function(a){g.top=void 0!==a.top?a.top:g.top,g.right=void 0!==a.right?a.right:g.right,g.bottom=void 0!==a.bottom?a.bottom:g.bottom,g.left=void 0!==a.left?a.left:g.left}},color:{get:function(){return t},set:function(b){t=a.utils.getColor(b)}}}),a.utils.initOptions(b),b},a.models.historicalBarChart=function(b){"use strict";function c(b){return b.each(function(k){z.reset(),z.models(f),q&&z.models(g),r&&z.models(h);var w=d3.select(this);a.utils.initSVG(w);var A=a.utils.availableWidth(n,w,l),B=a.utils.availableHeight(o,w,l);if(c.update=function(){w.transition().duration(y).call(c)},c.container=this,u.disabled=k.map(function(a){return!!a.disabled}),!v){var C;v={};for(C in u)u[C]instanceof Array?v[C]=u[C].slice(0):v[C]=u[C]}if(!(k&&k.length&&k.filter(function(a){return a.values.length}).length))return a.utils.noData(c,w),c;w.selectAll(".nv-noData").remove(),d=f.xScale(),e=f.yScale();var D=w.selectAll("g.nv-wrap.nv-historicalBarChart").data([k]),E=D.enter().append("g").attr("class","nvd3 nv-wrap nv-historicalBarChart").append("g"),F=D.select("g");E.append("g").attr("class","nv-x nv-axis"),E.append("g").attr("class","nv-y nv-axis"),E.append("g").attr("class","nv-barsWrap"),E.append("g").attr("class","nv-legendWrap"),E.append("g").attr("class","nv-interactive"),p?(i.width(A),F.select(".nv-legendWrap").datum(k).call(i),i.height()>l.top&&(l.top=i.height(),B=a.utils.availableHeight(o,w,l)),D.select(".nv-legendWrap").attr("transform","translate(0,"+-l.top+")")):F.select(".nv-legendWrap").selectAll("*").remove(),D.attr("transform","translate("+l.left+","+l.top+")"),s&&F.select(".nv-y.nv-axis").attr("transform","translate("+A+",0)"),t&&(j.width(A).height(B).margin({left:l.left,top:l.top}).svgContainer(w).xScale(d),D.select(".nv-interactive").call(j)),f.width(A).height(B).color(k.map(function(a,b){return a.color||m(a,b)}).filter(function(a,b){return!k[b].disabled}));var G=F.select(".nv-barsWrap").datum(k.filter(function(a){return!a.disabled}));G.transition().call(f),q&&(g.scale(d)._ticks(a.utils.calcTicksX(A/100,k)).tickSize(-B,0),F.select(".nv-x.nv-axis").attr("transform","translate(0,"+e.range()[0]+")"),F.select(".nv-x.nv-axis").transition().call(g)),r&&(h.scale(e)._ticks(a.utils.calcTicksY(B/36,k)).tickSize(-A,0),F.select(".nv-y.nv-axis").transition().call(h)),j.dispatch.on("elementMousemove",function(b){f.clearHighlights();var d,e,i,l=[];k.filter(function(a,b){return a.seriesIndex=b,!a.disabled}).forEach(function(g,h){e=a.interactiveBisect(g.values,b.pointXValue,c.x()),f.highlightPoint(e,!0);var j=g.values[e];void 0!==j&&(void 0===d&&(d=j),void 0===i&&(i=c.xScale()(c.x()(j,e))),l.push({key:g.key,value:c.y()(j,e),color:m(g,g.seriesIndex),data:g.values[e]}))});var n=g.tickFormat()(c.x()(d,e));j.tooltip.valueFormatter(function(a,b){return h.tickFormat()(a)}).data({value:n,index:e,series:l})(),j.renderGuideLine(i)}),j.dispatch.on("elementMouseout",function(a){x.tooltipHide(),f.clearHighlights()}),i.dispatch.on("legendClick",function(a,d){a.disabled=!a.disabled,k.filter(function(a){return!a.disabled}).length||k.map(function(a){return a.disabled=!1,D.selectAll(".nv-series").classed("disabled",!1),a}),u.disabled=k.map(function(a){return!!a.disabled}),x.stateChange(u),b.transition().call(c)}),i.dispatch.on("legendDblclick",function(a){k.forEach(function(a){a.disabled=!0}),a.disabled=!1,u.disabled=k.map(function(a){return!!a.disabled}),x.stateChange(u),c.update()}),x.on("changeState",function(a){"undefined"!=typeof a.disabled&&(k.forEach(function(b,c){b.disabled=a.disabled[c]}),u.disabled=a.disabled),c.update()})}),z.renderEnd("historicalBarChart immediate"),c}var d,e,f=b||a.models.historicalBar(),g=a.models.axis(),h=a.models.axis(),i=a.models.legend(),j=a.interactiveGuideline(),k=a.models.tooltip(),l={top:30,right:90,bottom:50,left:90},m=a.utils.defaultColor(),n=null,o=null,p=!1,q=!0,r=!0,s=!1,t=!1,u={},v=null,w=null,x=d3.dispatch("tooltipHide","stateChange","changeState","renderEnd"),y=250;g.orient("bottom").tickPadding(7),h.orient(s?"right":"left"),k.duration(0).headerEnabled(!1).valueFormatter(function(a,b){return h.tickFormat()(a,b)}).headerFormatter(function(a,b){return g.tickFormat()(a,b)});var z=a.utils.renderWatch(x,0);return f.dispatch.on("elementMouseover.tooltip",function(a){a.series={key:c.x()(a.data),value:c.y()(a.data),color:a.color},k.data(a).hidden(!1)}),f.dispatch.on("elementMouseout.tooltip",function(a){k.hidden(!0)}),f.dispatch.on("elementMousemove.tooltip",function(a){k()}),c.dispatch=x,c.bars=f,c.legend=i,c.xAxis=g,c.yAxis=h,c.interactiveLayer=j,c.tooltip=k,c.options=a.utils.optionsFunc.bind(c),c._options=Object.create({},{width:{get:function(){return n},set:function(a){n=a}},height:{get:function(){return o},set:function(a){o=a}},showLegend:{get:function(){return p},set:function(a){p=a}},showXAxis:{get:function(){return q},set:function(a){q=a}},showYAxis:{get:function(){return r},set:function(a){r=a}},defaultState:{get:function(){return v},set:function(a){v=a}},noData:{get:function(){return w},set:function(a){w=a}},margin:{get:function(){return l},set:function(a){l.top=void 0!==a.top?a.top:l.top,l.right=void 0!==a.right?a.right:l.right,l.bottom=void 0!==a.bottom?a.bottom:l.bottom,l.left=void 0!==a.left?a.left:l.left}},color:{get:function(){return m},set:function(b){m=a.utils.getColor(b),i.color(m),f.color(m)}},duration:{get:function(){return y},set:function(a){y=a,z.reset(y),h.duration(y),g.duration(y)}},rightAlignYAxis:{get:function(){return s},set:function(a){s=a,h.orient(a?"right":"left")}},useInteractiveGuideline:{get:function(){return t},set:function(a){t=a,a===!0&&c.interactive(!1)}}}),a.utils.inheritOptions(c,f),a.utils.initOptions(c),c},a.models.ohlcBarChart=function(){var b=a.models.historicalBarChart(a.models.ohlcBar());return b.useInteractiveGuideline(!0),b.interactiveLayer.tooltip.contentGenerator(function(a){var c=a.series[0].data,d=c.open<c.close?"2ca02c":"d62728";return'<h3 style="color: #'+d+'">'+a.value+"</h3><table><tr><td>open:</td><td>"+b.yAxis.tickFormat()(c.open)+"</td></tr><tr><td>close:</td><td>"+b.yAxis.tickFormat()(c.close)+"</td></tr><tr><td>high</td><td>"+b.yAxis.tickFormat()(c.high)+"</td></tr><tr><td>low:</td><td>"+b.yAxis.tickFormat()(c.low)+"</td></tr></table>"}),b},a.models.candlestickBarChart=function(){var b=a.models.historicalBarChart(a.models.candlestickBar());return b.useInteractiveGuideline(!0),b.interactiveLayer.tooltip.contentGenerator(function(a){var c=a.series[0].data,d=c.open<c.close?"2ca02c":"d62728";return'<h3 style="color: #'+d+'">'+a.value+"</h3><table><tr><td>open:</td><td>"+b.yAxis.tickFormat()(c.open)+"</td></tr><tr><td>close:</td><td>"+b.yAxis.tickFormat()(c.close)+"</td></tr><tr><td>high</td><td>"+b.yAxis.tickFormat()(c.high)+"</td></tr><tr><td>low:</td><td>"+b.yAxis.tickFormat()(c.low)+"</td></tr></table>"}),b},a.models.legend=function(){"use strict";function b(r){function s(a,b){return"furious"!=q?"#000":o?a.disengaged?"#000":"#fff":o?void 0:(a.color||(a.color=h(a,b)),a.disabled?a.color:"#fff")}function t(a,b){return o&&"furious"==q&&a.disengaged?"#eee":a.color||h(a,b)}function u(a,b){return o&&"furious"==q?1:a.disabled?0:1}return r.each(function(b){var h=d-c.left-c.right,r=d3.select(this);a.utils.initSVG(r);var v=r.selectAll("g.nv-legend").data([b]),w=v.enter().append("g").attr("class","nvd3 nv-legend").append("g"),x=v.select("g");v.attr("transform","translate("+c.left+","+c.top+")");var y,z,A=x.selectAll(".nv-series").data(function(a){return"furious"!=q?a:a.filter(function(a){return o?!0:!a.disengaged})}),B=A.enter().append("g").attr("class","nv-series");switch(q){case"furious":z=23;break;case"classic":z=20}if("classic"==q)B.append("circle").style("stroke-width",2).attr("class","nv-legend-symbol").attr("r",5),y=A.select(".nv-legend-symbol");else if("furious"==q){B.append("rect").style("stroke-width",2).attr("class","nv-legend-symbol").attr("rx",3).attr("ry",3),y=A.select(".nv-legend-symbol"),B.append("g").attr("class","nv-check-box").property("innerHTML",'<path d="M0.5,5 L22.5,5 L22.5,26.5 L0.5,26.5 L0.5,5 Z" class="nv-box"></path><path d="M5.5,12.8618467 L11.9185089,19.2803556 L31,0.198864511" class="nv-check"></path>').attr("transform","translate(-10,-8)scale(0.5)");var C=A.select(".nv-check-box");C.each(function(a,b){d3.select(this).selectAll("path").attr("stroke",s(a,b))})}B.append("text").attr("text-anchor","start").attr("class","nv-legend-text").attr("dy",".32em").attr("dx","8");var D=A.select("text.nv-legend-text");A.on("mouseover",function(a,b){p.legendMouseover(a,b)}).on("mouseout",function(a,b){p.legendMouseout(a,b)}).on("click",function(a,b){p.legendClick(a,b);var c=A.data();if(m){if("classic"==q)n?(c.forEach(function(a){a.disabled=!0}),a.disabled=!1):(a.disabled=!a.disabled,c.every(function(a){return a.disabled})&&c.forEach(function(a){a.disabled=!1}));else if("furious"==q)if(o)a.disengaged=!a.disengaged,a.userDisabled=void 0==a.userDisabled?!!a.disabled:a.userDisabled,a.disabled=a.disengaged||a.userDisabled;else if(!o){a.disabled=!a.disabled,a.userDisabled=a.disabled;var d=c.filter(function(a){return!a.disengaged});d.every(function(a){return a.userDisabled})&&c.forEach(function(a){a.disabled=a.userDisabled=!1})}p.stateChange({disabled:c.map(function(a){return!!a.disabled}),disengaged:c.map(function(a){return!!a.disengaged})})}}).on("dblclick",function(a,b){if(("furious"!=q||!o)&&(p.legendDblclick(a,b),m)){var c=A.data();c.forEach(function(a){a.disabled=!0,"furious"==q&&(a.userDisabled=a.disabled)}),a.disabled=!1,"furious"==q&&(a.userDisabled=a.disabled),p.stateChange({disabled:c.map(function(a){return!!a.disabled})})}}),A.classed("nv-disabled",function(a){return a.userDisabled}),A.exit().remove(),D.attr("fill",s).text(function(a){return g(f(a))});var E=0;if(j){var F=[];A.each(function(b,c){var d;if(g(f(b))&&g(f(b)).length>i){var e=g(f(b)).substring(0,i);d=d3.select(this).select("text").text(e+"..."),d3.select(this).append("svg:title").text(g(f(b)))}else d=d3.select(this).select("text");var h;try{if(h=d.node().getComputedTextLength(),0>=h)throw Error()}catch(j){h=a.utils.calcApproxTextWidth(d)}F.push(h+k)});var G=0,H=[];for(E=0;h>E&&G<F.length;)H[G]=F[G],E+=F[G++];for(0===G&&(G=1);E>h&&G>1;){H=[],G--;for(var I=0;I<F.length;I++)F[I]>(H[I%G]||0)&&(H[I%G]=F[I]);E=H.reduce(function(a,b,c,d){return a+b})}for(var J=[],K=0,L=0;G>K;K++)J[K]=L,L+=H[K];A.attr("transform",function(a,b){return"translate("+J[b%G]+","+(5+Math.floor(b/G)*z)+")"}),l?x.attr("transform","translate("+(d-c.right-E)+","+c.top+")"):x.attr("transform","translate(0,"+c.top+")"),e=c.top+c.bottom+Math.ceil(F.length/G)*z}else{var M,N=5,O=5,P=0;A.attr("transform",function(a,b){var e=d3.select(this).select("text").node().getComputedTextLength()+k;return M=O,d<c.left+c.right+M+e&&(O=M=5,N+=z),O+=e,O>P&&(P=O),M+P>E&&(E=M+P),"translate("+M+","+N+")"}),x.attr("transform","translate("+(d-c.right-P)+","+c.top+")"),e=c.top+c.bottom+N+15}if("furious"==q){y.attr("width",function(a,b){return D[0][b].getComputedTextLength()+27}).attr("height",18).attr("y",-9).attr("x",-15),w.insert("rect",":first-child").attr("class","nv-legend-bg").attr("fill","#eee").attr("opacity",0);var Q=x.select(".nv-legend-bg");Q.transition().duration(300).attr("x",-z).attr("width",E+z-12).attr("height",e+10).attr("y",-c.top-10).attr("opacity",o?1:0)}y.style("fill",t).style("fill-opacity",u).style("stroke",t)}),b}var c={top:5,right:0,bottom:5,left:0},d=400,e=20,f=function(a){return a.key},g=function(a){return a},h=a.utils.getColor(),i=20,j=!0,k=32,l=!0,m=!0,n=!1,o=!1,p=d3.dispatch("legendClick","legendDblclick","legendMouseover","legendMouseout","stateChange"),q="classic";return b.dispatch=p,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return d},set:function(a){d=a}},height:{get:function(){return e},set:function(a){e=a}},key:{get:function(){return f},set:function(a){f=a}},keyFormatter:{get:function(){return g},set:function(a){g=a}},align:{get:function(){return j},set:function(a){j=a}},maxKeyLength:{get:function(){return i},set:function(a){i=a}},rightAlign:{get:function(){return l},set:function(a){l=a}},padding:{get:function(){return k},set:function(a){k=a}},updateState:{get:function(){return m},set:function(a){m=a}},radioButtonMode:{get:function(){return n},set:function(a){n=a}},expanded:{get:function(){return o},set:function(a){o=a}},vers:{get:function(){return q},set:function(a){q=a}},margin:{get:function(){return c},set:function(a){c.top=void 0!==a.top?a.top:c.top,c.right=void 0!==a.right?a.right:c.right,c.bottom=void 0!==a.bottom?a.bottom:c.bottom,c.left=void 0!==a.left?a.left:c.left}},color:{get:function(){return h},set:function(b){h=a.utils.getColor(b)}}}),a.utils.initOptions(b),b},a.models.line=function(){"use strict";function b(r){return v.reset(),v.models(e),r.each(function(b){i=d3.select(this);var r=a.utils.availableWidth(g,i,f),s=a.utils.availableHeight(h,i,f);a.utils.initSVG(i),c=e.xScale(),d=e.yScale(),t=t||c,u=u||d;var w=i.selectAll("g.nv-wrap.nv-line").data([b]),x=w.enter().append("g").attr("class","nvd3 nv-wrap nv-line"),y=x.append("defs"),z=x.append("g"),A=w.select("g");z.append("g").attr("class","nv-groups"),z.append("g").attr("class","nv-scatterWrap"),w.attr("transform","translate("+f.left+","+f.top+")"),e.width(r).height(s);var B=w.select(".nv-scatterWrap");B.call(e),y.append("clipPath").attr("id","nv-edge-clip-"+e.id()).append("rect"),w.select("#nv-edge-clip-"+e.id()+" rect").attr("width",r).attr("height",s>0?s:0),A.attr("clip-path",p?"url(#nv-edge-clip-"+e.id()+")":""),B.attr("clip-path",p?"url(#nv-edge-clip-"+e.id()+")":"");var C=w.select(".nv-groups").selectAll(".nv-group").data(function(a){return a},function(a){return a.key});C.enter().append("g").style("stroke-opacity",1e-6).style("stroke-width",function(a){return a.strokeWidth||j}).style("fill-opacity",1e-6),C.exit().remove(),C.attr("class",function(a,b){return(a.classed||"")+" nv-group nv-series-"+b}).classed("hover",function(a){return a.hover}).style("fill",function(a,b){return k(a,b)}).style("stroke",function(a,b){return k(a,b)}),C.watchTransition(v,"line: groups").style("stroke-opacity",1).style("fill-opacity",function(a){return a.fillOpacity||.5});var D=C.selectAll("path.nv-area").data(function(a){return o(a)?[a]:[]});D.enter().append("path").attr("class","nv-area").attr("d",function(b){return d3.svg.area().interpolate(q).defined(n).x(function(b,c){return a.utils.NaNtoZero(t(l(b,c)))}).y0(function(b,c){return a.utils.NaNtoZero(u(m(b,c)))}).y1(function(a,b){return u(d.domain()[0]<=0?d.domain()[1]>=0?0:d.domain()[1]:d.domain()[0])}).apply(this,[b.values])}),C.exit().selectAll("path.nv-area").remove(),D.watchTransition(v,"line: areaPaths").attr("d",function(b){return d3.svg.area().interpolate(q).defined(n).x(function(b,d){return a.utils.NaNtoZero(c(l(b,d)))}).y0(function(b,c){return a.utils.NaNtoZero(d(m(b,c)))}).y1(function(a,b){return d(d.domain()[0]<=0?d.domain()[1]>=0?0:d.domain()[1]:d.domain()[0])}).apply(this,[b.values])});var E=C.selectAll("path.nv-line").data(function(a){return[a.values]});E.enter().append("path").attr("class","nv-line").attr("d",d3.svg.line().interpolate(q).defined(n).x(function(b,c){return a.utils.NaNtoZero(t(l(b,c)))}).y(function(b,c){return a.utils.NaNtoZero(u(m(b,c)))})),E.watchTransition(v,"line: linePaths").attr("d",d3.svg.line().interpolate(q).defined(n).x(function(b,d){return a.utils.NaNtoZero(c(l(b,d)))}).y(function(b,c){return a.utils.NaNtoZero(d(m(b,c)))})),t=c.copy(),u=d.copy()}),v.renderEnd("line immediate"),b}var c,d,e=a.models.scatter(),f={top:0,right:0,bottom:0,left:0},g=960,h=500,i=null,j=1.5,k=a.utils.defaultColor(),l=function(a){return a.x},m=function(a){return a.y},n=function(a,b){return!isNaN(m(a,b))&&null!==m(a,b)},o=function(a){return a.area},p=!1,q="linear",r=250,s=d3.dispatch("elementClick","elementMouseover","elementMouseout","renderEnd");e.pointSize(16).pointDomain([16,256]);var t,u,v=a.utils.renderWatch(s,r);return b.dispatch=s,b.scatter=e,e.dispatch.on("elementClick",function(){s.elementClick.apply(this,arguments)}),e.dispatch.on("elementMouseover",function(){s.elementMouseover.apply(this,arguments)}),e.dispatch.on("elementMouseout",function(){s.elementMouseout.apply(this,arguments)}),b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return g},set:function(a){g=a}},height:{get:function(){return h},set:function(a){h=a}},defined:{get:function(){return n},set:function(a){n=a}},interpolate:{get:function(){return q},set:function(a){q=a}},clipEdge:{get:function(){return p},set:function(a){p=a}},margin:{get:function(){return f},set:function(a){f.top=void 0!==a.top?a.top:f.top,f.right=void 0!==a.right?a.right:f.right,f.bottom=void 0!==a.bottom?a.bottom:f.bottom,f.left=void 0!==a.left?a.left:f.left}},duration:{get:function(){return r},set:function(a){r=a,v.reset(r),e.duration(r)}},isArea:{get:function(){return o},set:function(a){o=d3.functor(a)}},x:{get:function(){return l},set:function(a){l=a,e.x(a)}},y:{get:function(){return m},set:function(a){m=a,e.y(a)}},color:{get:function(){return k},set:function(b){k=a.utils.getColor(b),e.color(k)}}}),a.utils.inheritOptions(b,e),a.utils.initOptions(b),b},a.models.lineChart=function(){"use strict";function b(j){return B.reset(),B.models(e),r&&B.models(f),s&&B.models(g),j.each(function(j){function y(){r&&L.select(".nv-focus .nv-x.nv-axis").transition().duration(A).call(f)}function B(){s&&L.select(".nv-focus .nv-y.nv-axis").transition().duration(A).call(g)}function E(a){var b=L.select(".nv-focus .nv-linesWrap").datum(j.filter(function(a){return!a.disabled}).map(function(b,c){return{key:b.key,area:b.area,classed:b.classed,values:b.values.filter(function(b,c){return e.x()(b,c)>=a[0]&&e.x()(b,c)<=a[1]}),disableTooltip:b.disableTooltip}}));b.transition().duration(A).call(e),y(),B()}var F=d3.select(this);a.utils.initSVG(F);var G=a.utils.availableWidth(n,F,l),H=a.utils.availableHeight(o,F,l)-(v?k.height():0);if(b.update=function(){0===A?F.call(b):F.transition().duration(A).call(b)},b.container=this,w.setter(D(j),b.update).getter(C(j)).update(),w.disabled=j.map(function(a){return!!a.disabled}),!x){var I;x={};for(I in w)w[I]instanceof Array?x[I]=w[I].slice(0):x[I]=w[I]}if(!(j&&j.length&&j.filter(function(a){return a.values.length}).length))return a.utils.noData(b,F),b;F.selectAll(".nv-noData").remove(),k.dispatch.on("onBrush",function(a){E(a)}),c=e.xScale(),d=e.yScale();var J=F.selectAll("g.nv-wrap.nv-lineChart").data([j]),K=J.enter().append("g").attr("class","nvd3 nv-wrap nv-lineChart").append("g"),L=J.select("g");K.append("g").attr("class","nv-legendWrap");var M=K.append("g").attr("class","nv-focus");M.append("g").attr("class","nv-background").append("rect"),M.append("g").attr("class","nv-x nv-axis"),M.append("g").attr("class","nv-y nv-axis"),M.append("g").attr("class","nv-linesWrap"),M.append("g").attr("class","nv-interactive");K.append("g").attr("class","nv-focusWrap");p?(h.width(G),L.select(".nv-legendWrap").datum(j).call(h),"bottom"===q?J.select(".nv-legendWrap").attr("transform","translate(0,"+H+")"):"top"===q&&(h.height()>l.top&&(l.top=h.height(),H=a.utils.availableHeight(o,F,l)-(v?k.height():0)),J.select(".nv-legendWrap").attr("transform","translate(0,"+-l.top+")"))):L.select(".nv-legendWrap").selectAll("*").remove(),J.attr("transform","translate("+l.left+","+l.top+")"),t&&L.select(".nv-y.nv-axis").attr("transform","translate("+G+",0)"),u&&(i.width(G).height(H).margin({left:l.left,top:l.top}).svgContainer(F).xScale(c),J.select(".nv-interactive").call(i)),L.select(".nv-focus .nv-background rect").attr("width",G).attr("height",H),e.width(G).height(H).color(j.map(function(a,b){return a.color||m(a,b)}).filter(function(a,b){return!j[b].disabled}));var N=L.select(".nv-linesWrap").datum(j.filter(function(a){return!a.disabled}));if(r&&f.scale(c)._ticks(a.utils.calcTicksX(G/100,j)).tickSize(-H,0),s&&g.scale(d)._ticks(a.utils.calcTicksY(H/36,j)).tickSize(-G,0),L.select(".nv-focus .nv-x.nv-axis").attr("transform","translate(0,"+H+")"),v){k.width(G),L.select(".nv-focusWrap").attr("transform","translate(0,"+(H+l.bottom+k.margin().top)+")").datum(j.filter(function(a){return!a.disabled})).call(k);var O=k.brush.empty()?k.xDomain():k.brush.extent();null!==O&&E(O)}else N.call(e),y(),B();h.dispatch.on("stateChange",function(a){for(var c in a)w[c]=a[c];z.stateChange(w),b.update()}),i.dispatch.on("elementMousemove",function(d){e.clearHighlights();var f,h,l,n=[];if(j.filter(function(a,b){return a.seriesIndex=b,!a.disabled&&!a.disableTooltip}).forEach(function(g,i){var j=v?k.brush.empty()?k.xScale().domain():k.brush.extent():c.domain(),o=g.values.filter(function(a,b){return e.x()(a,b)>=j[0]&&e.x()(a,b)<=j[1]});h=a.interactiveBisect(o,d.pointXValue,e.x());var p=o[h],q=b.y()(p,h);null!==q&&e.highlightPoint(g.seriesIndex,h,!0),void 0!==p&&(void 0===f&&(f=p),void 0===l&&(l=b.xScale()(b.x()(p,h))),n.push({key:g.key,value:q,color:m(g,g.seriesIndex),data:p}))}),n.length>2){var o=b.yScale().invert(d.mouseY),p=Math.abs(b.yScale().domain()[0]-b.yScale().domain()[1]),q=.03*p,r=a.nearestValueIndex(n.map(function(a){return a.value}),o,q);null!==r&&(n[r].highlight=!0)}var s=function(a,b){return null==a?"N/A":g.tickFormat()(a)};i.tooltip.valueFormatter(i.tooltip.valueFormatter()||s).data({value:b.x()(f,h),index:h,series:n})(),i.renderGuideLine(l)}),i.dispatch.on("elementClick",function(c){var d,f=[];j.filter(function(a,b){return a.seriesIndex=b,!a.disabled}).forEach(function(e){var g=a.interactiveBisect(e.values,c.pointXValue,b.x()),h=e.values[g];if("undefined"!=typeof h){"undefined"==typeof d&&(d=b.xScale()(b.x()(h,g)));var i=b.yScale()(b.y()(h,g));f.push({point:h,pointIndex:g,pos:[d,i],seriesIndex:e.seriesIndex,series:e})}}),e.dispatch.elementClick(f)}),i.dispatch.on("elementMouseout",function(a){e.clearHighlights()}),z.on("changeState",function(a){"undefined"!=typeof a.disabled&&j.length===a.disabled.length&&(j.forEach(function(b,c){b.disabled=a.disabled[c]}),w.disabled=a.disabled),b.update()})}),B.renderEnd("lineChart immediate"),b}var c,d,e=a.models.line(),f=a.models.axis(),g=a.models.axis(),h=a.models.legend(),i=a.interactiveGuideline(),j=a.models.tooltip(),k=a.models.focus(a.models.line()),l={top:30,right:20,bottom:50,left:60},m=a.utils.defaultColor(),n=null,o=null,p=!0,q="top",r=!0,s=!0,t=!1,u=!1,v=!1,w=a.utils.state(),x=null,y=null,z=d3.dispatch("tooltipShow","tooltipHide","stateChange","changeState","renderEnd"),A=250;f.orient("bottom").tickPadding(7),g.orient(t?"right":"left"),e.clipEdge(!0).duration(0),j.valueFormatter(function(a,b){return g.tickFormat()(a,b)}).headerFormatter(function(a,b){return f.tickFormat()(a,b)}),i.tooltip.valueFormatter(function(a,b){return g.tickFormat()(a,b)}).headerFormatter(function(a,b){return f.tickFormat()(a,b)});var B=a.utils.renderWatch(z,A),C=function(a){return function(){return{active:a.map(function(a){return!a.disabled})}}},D=function(a){return function(b){void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}};return e.dispatch.on("elementMouseover.tooltip",function(a){a.series.disableTooltip||j.data(a).hidden(!1)}),e.dispatch.on("elementMouseout.tooltip",function(a){j.hidden(!0)}),b.dispatch=z,b.lines=e,b.legend=h,b.focus=k,b.xAxis=f,b.x2Axis=k.xAxis,b.yAxis=g,b.y2Axis=k.yAxis,b.interactiveLayer=i,b.tooltip=j,b.state=w,b.dispatch=z,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return n},set:function(a){n=a}},height:{get:function(){return o},set:function(a){o=a}},showLegend:{get:function(){return p},set:function(a){p=a}},legendPosition:{get:function(){return q},set:function(a){q=a}},showXAxis:{get:function(){return r},set:function(a){r=a}},showYAxis:{get:function(){return s},set:function(a){s=a}},defaultState:{get:function(){return x},set:function(a){x=a}},noData:{get:function(){return y},set:function(a){y=a}},focusEnable:{get:function(){return v},set:function(a){v=a}},focusHeight:{get:function(){return k.height()},set:function(a){k.height(a)}},focusShowAxisX:{get:function(){return k.showXAxis()},set:function(a){k.showXAxis(a)}},focusShowAxisY:{get:function(){return k.showYAxis()},set:function(a){k.showYAxis(a)}},brushExtent:{get:function(){return k.brushExtent()},set:function(a){k.brushExtent(a)}},focusMargin:{get:function(){return k.margin},set:function(a){k.margin.top=void 0!==a.top?a.top:k.margin.top,k.margin.right=void 0!==a.right?a.right:k.margin.right,k.margin.bottom=void 0!==a.bottom?a.bottom:k.margin.bottom,k.margin.left=void 0!==a.left?a.left:k.margin.left}},margin:{get:function(){return l},set:function(a){l.top=void 0!==a.top?a.top:l.top,l.right=void 0!==a.right?a.right:l.right,l.bottom=void 0!==a.bottom?a.bottom:l.bottom,l.left=void 0!==a.left?a.left:l.left}},duration:{get:function(){return A},set:function(a){A=a,B.reset(A),e.duration(A),k.duration(A),f.duration(A),g.duration(A)}},color:{get:function(){return m},set:function(b){m=a.utils.getColor(b),h.color(m),e.color(m),k.color(m)}},interpolate:{get:function(){return e.interpolate()},set:function(a){e.interpolate(a),k.interpolate(a)}},xTickFormat:{get:function(){return f.tickFormat()},set:function(a){f.tickFormat(a),k.xTickFormat(a)}},yTickFormat:{get:function(){return g.tickFormat()},set:function(a){g.tickFormat(a),k.yTickFormat(a)}},x:{get:function(){return e.x()},set:function(a){e.x(a),k.x(a)}},y:{get:function(){return e.y()},set:function(a){e.y(a),k.y(a)}},rightAlignYAxis:{get:function(){return t},set:function(a){t=a,g.orient(t?"right":"left")}},useInteractiveGuideline:{get:function(){return u},set:function(a){u=a,u&&(e.interactive(!1),e.useVoronoi(!1))}}}),a.utils.inheritOptions(b,e),a.utils.initOptions(b),b},a.models.lineWithFocusChart=function(){return a.models.lineChart().margin({bottom:30}).focusEnable(!0)},a.models.linePlusBarChart=function(){"use strict";function b(v){return v.each(function(v){function J(a){var b=+("e"==a),c=b?1:-1,d=Z/3;return"M"+.5*c+","+d+"A6,6 0 0 "+b+" "+6.5*c+","+(d+6)+"V"+(2*d-6)+"A6,6 0 0 "+b+" "+.5*c+","+2*d+"ZM"+2.5*c+","+(d+8)+"V"+(2*d-8)+"M"+4.5*c+","+(d+8)+"V"+(2*d-8)}function R(){u.empty()||u.extent(I),ma.data([u.empty()?e.domain():I]).each(function(a,b){var c=e(a[0])-e.range()[0],d=e.range()[1]-e(a[1]);d3.select(this).select(".left").attr("width",0>c?0:c),d3.select(this).select(".right").attr("x",e(a[1])).attr("width",0>d?0:d)})}function S(){I=u.empty()?null:u.extent(),c=u.empty()?e.domain():u.extent(),K.brush({extent:c,brush:u}),R(),l.width(X).height(Y).color(v.map(function(a,b){return a.color||C(a,b)}).filter(function(a,b){return!v[b].disabled&&v[b].bar})),j.width(X).height(Y).color(v.map(function(a,b){return a.color||C(a,b)}).filter(function(a,b){return!v[b].disabled&&!v[b].bar}));var b=fa.select(".nv-focus .nv-barsWrap").datum(_.length?_.map(function(a,b){return{key:a.key,values:a.values.filter(function(a,b){return l.x()(a,b)>=c[0]&&l.x()(a,b)<=c[1]})}}):[{values:[]}]),h=fa.select(".nv-focus .nv-linesWrap").datum(V(aa)?[{values:[]}]:aa.filter(function(a){return!a.disabled}).map(function(a,b){return{area:a.area,fillOpacity:a.fillOpacity,strokeWidth:a.strokeWidth,key:a.key,values:a.values.filter(function(a,b){return j.x()(a,b)>=c[0]&&j.x()(a,b)<=c[1]})}}));d=_.length&&!Q?l.xScale():j.xScale(),n.scale(d)._ticks(a.utils.calcTicksX(X/100,v)).tickSize(-Y,0),n.domain([Math.ceil(c[0]),Math.floor(c[1])]),fa.select(".nv-x.nv-axis").transition().duration(L).call(n),b.transition().duration(L).call(l),h.transition().duration(L).call(j),fa.select(".nv-focus .nv-x.nv-axis").attr("transform","translate(0,"+f.range()[0]+")"),p.scale(f)._ticks(a.utils.calcTicksY(Y/36,v)).tickSize(-X,0),q.scale(g)._ticks(a.utils.calcTicksY(Y/36,v)),Q?q.tickSize(aa.length?0:-X,0):q.tickSize(_.length?0:-X,0);var i=_.length?1:0,k=aa.length&&!V(aa)?1:0,m=Q?k:i,o=Q?i:k;fa.select(".nv-focus .nv-y1.nv-axis").style("opacity",m),fa.select(".nv-focus .nv-y2.nv-axis").style("opacity",o).attr("transform","translate("+d.range()[1]+",0)"),fa.select(".nv-focus .nv-y1.nv-axis").transition().duration(L).call(p),fa.select(".nv-focus .nv-y2.nv-axis").transition().duration(L).call(q)}var W=d3.select(this);a.utils.initSVG(W);var X=a.utils.availableWidth(y,W,w),Y=a.utils.availableHeight(z,W,w)-(E?H:0),Z=H-x.top-x.bottom;if(b.update=function(){W.transition().duration(L).call(b)},b.container=this,M.setter(U(v),b.update).getter(T(v)).update(),M.disabled=v.map(function(a){return!!a.disabled}),!N){var $;N={};for($ in M)M[$]instanceof Array?N[$]=M[$].slice(0):N[$]=M[$]}if(!(v&&v.length&&v.filter(function(a){return a.values.length}).length))return a.utils.noData(b,W),b;W.selectAll(".nv-noData").remove();var _=v.filter(function(a){return!a.disabled&&a.bar}),aa=v.filter(function(a){return!a.bar});d=_.length&&!Q?l.xScale():j.xScale(),e=o.scale(),f=Q?j.yScale():l.yScale(),g=Q?l.yScale():j.yScale(),h=Q?k.yScale():m.yScale(),i=Q?m.yScale():k.yScale();var ba=v.filter(function(a){return!a.disabled&&(Q?!a.bar:a.bar)}).map(function(a){return a.values.map(function(a,b){return{x:A(a,b),y:B(a,b)}})}),ca=v.filter(function(a){return!a.disabled&&(Q?a.bar:!a.bar)}).map(function(a){return a.values.map(function(a,b){return{x:A(a,b),y:B(a,b)}})});d.range([0,X]),e.domain(d3.extent(d3.merge(ba.concat(ca)),function(a){return a.x})).range([0,X]);var da=W.selectAll("g.nv-wrap.nv-linePlusBar").data([v]),ea=da.enter().append("g").attr("class","nvd3 nv-wrap nv-linePlusBar").append("g"),fa=da.select("g");ea.append("g").attr("class","nv-legendWrap");var ga=ea.append("g").attr("class","nv-focus");ga.append("g").attr("class","nv-x nv-axis"),ga.append("g").attr("class","nv-y1 nv-axis"),ga.append("g").attr("class","nv-y2 nv-axis"),ga.append("g").attr("class","nv-barsWrap"),ga.append("g").attr("class","nv-linesWrap");var ha=ea.append("g").attr("class","nv-context");if(ha.append("g").attr("class","nv-x nv-axis"),ha.append("g").attr("class","nv-y1 nv-axis"),ha.append("g").attr("class","nv-y2 nv-axis"),ha.append("g").attr("class","nv-barsWrap"),ha.append("g").attr("class","nv-linesWrap"),ha.append("g").attr("class","nv-brushBackground"),ha.append("g").attr("class","nv-x nv-brush"),D){var ia=t.align()?X/2:X,ja=t.align()?ia:0;t.width(ia),fa.select(".nv-legendWrap").datum(v.map(function(a){return a.originalKey=void 0===a.originalKey?a.key:a.originalKey,Q?a.key=a.originalKey+(a.bar?P:O):a.key=a.originalKey+(a.bar?O:P),a})).call(t),t.height()>w.top&&(w.top=t.height(),Y=a.utils.availableHeight(z,W,w)-H),fa.select(".nv-legendWrap").attr("transform","translate("+ja+","+-w.top+")")}else fa.select(".nv-legendWrap").selectAll("*").remove();da.attr("transform","translate("+w.left+","+w.top+")"),fa.select(".nv-context").style("display",E?"initial":"none"),m.width(X).height(Z).color(v.map(function(a,b){return a.color||C(a,b)}).filter(function(a,b){return!v[b].disabled&&v[b].bar})),k.width(X).height(Z).color(v.map(function(a,b){return a.color||C(a,b)}).filter(function(a,b){return!v[b].disabled&&!v[b].bar}));var ka=fa.select(".nv-context .nv-barsWrap").datum(_.length?_:[{values:[]}]),la=fa.select(".nv-context .nv-linesWrap").datum(V(aa)?[{values:[]}]:aa.filter(function(a){return!a.disabled}));fa.select(".nv-context").attr("transform","translate(0,"+(Y+w.bottom+x.top)+")"),ka.transition().call(m),la.transition().call(k),G&&(o._ticks(a.utils.calcTicksX(X/100,v)).tickSize(-Z,0),fa.select(".nv-context .nv-x.nv-axis").attr("transform","translate(0,"+h.range()[0]+")"),fa.select(".nv-context .nv-x.nv-axis").transition().call(o)),F&&(r.scale(h)._ticks(Z/36).tickSize(-X,0),s.scale(i)._ticks(Z/36).tickSize(_.length?0:-X,0),fa.select(".nv-context .nv-y3.nv-axis").style("opacity",_.length?1:0).attr("transform","translate(0,"+e.range()[0]+")"),fa.select(".nv-context .nv-y2.nv-axis").style("opacity",aa.length?1:0).attr("transform","translate("+e.range()[1]+",0)"),fa.select(".nv-context .nv-y1.nv-axis").transition().call(r),fa.select(".nv-context .nv-y2.nv-axis").transition().call(s)),u.x(e).on("brush",S),I&&u.extent(I);var ma=fa.select(".nv-brushBackground").selectAll("g").data([I||u.extent()]),na=ma.enter().append("g");na.append("rect").attr("class","left").attr("x",0).attr("y",0).attr("height",Z),na.append("rect").attr("class","right").attr("x",0).attr("y",0).attr("height",Z);var oa=fa.select(".nv-x.nv-brush").call(u);oa.selectAll("rect").attr("height",Z),oa.selectAll(".resize").append("path").attr("d",J),t.dispatch.on("stateChange",function(a){for(var c in a)M[c]=a[c];K.stateChange(M),b.update()}),K.on("changeState",function(a){"undefined"!=typeof a.disabled&&(v.forEach(function(b,c){b.disabled=a.disabled[c]}),M.disabled=a.disabled),b.update()}),S()}),b}var c,d,e,f,g,h,i,j=a.models.line(),k=a.models.line(),l=a.models.historicalBar(),m=a.models.historicalBar(),n=a.models.axis(),o=a.models.axis(),p=a.models.axis(),q=a.models.axis(),r=a.models.axis(),s=a.models.axis(),t=a.models.legend(),u=d3.svg.brush(),v=a.models.tooltip(),w={top:30,right:30,bottom:30,left:60},x={top:0,right:30,bottom:20,left:60},y=null,z=null,A=function(a){return a.x},B=function(a){return a.y},C=a.utils.defaultColor(),D=!0,E=!0,F=!1,G=!0,H=50,I=null,J=null,K=d3.dispatch("brush","stateChange","changeState"),L=0,M=a.utils.state(),N=null,O=" (left axis)",P=" (right axis)",Q=!1;j.clipEdge(!0),k.interactive(!1),k.pointActive(function(a){return!1}),n.orient("bottom").tickPadding(5),p.orient("left"),q.orient("right"),o.orient("bottom").tickPadding(5),r.orient("left"),s.orient("right"),v.headerEnabled(!0).headerFormatter(function(a,b){return n.tickFormat()(a,b)});var R=function(){return Q?{main:q,focus:s}:{main:p,focus:r}},S=function(){return Q?{main:p,focus:r}:{main:q,focus:s}},T=function(a){return function(){return{active:a.map(function(a){return!a.disabled})}}},U=function(a){return function(b){void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}},V=function(a){return a.every(function(a){return a.disabled})};return j.dispatch.on("elementMouseover.tooltip",function(a){v.duration(100).valueFormatter(function(a,b){return S().main.tickFormat()(a,b)}).data(a).hidden(!1)}),j.dispatch.on("elementMouseout.tooltip",function(a){v.hidden(!0)}),l.dispatch.on("elementMouseover.tooltip",function(a){a.value=b.x()(a.data),a.series={value:b.y()(a.data),color:a.color},v.duration(0).valueFormatter(function(a,b){return R().main.tickFormat()(a,b)}).data(a).hidden(!1)}),l.dispatch.on("elementMouseout.tooltip",function(a){v.hidden(!0)}),l.dispatch.on("elementMousemove.tooltip",function(a){v()}),b.dispatch=K,b.legend=t,b.lines=j,b.lines2=k,b.bars=l,b.bars2=m,b.xAxis=n,b.x2Axis=o,b.y1Axis=p,b.y2Axis=q,b.y3Axis=r,b.y4Axis=s,b.tooltip=v,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return y},set:function(a){y=a}},height:{get:function(){return z},set:function(a){z=a}},showLegend:{get:function(){return D},set:function(a){D=a}},brushExtent:{get:function(){return I},set:function(a){I=a}},noData:{get:function(){return J},set:function(a){J=a}},focusEnable:{get:function(){return E},set:function(a){E=a}},focusHeight:{get:function(){return H},set:function(a){H=a}},focusShowAxisX:{get:function(){return G},set:function(a){G=a}},focusShowAxisY:{get:function(){return F},set:function(a){F=a}},legendLeftAxisHint:{get:function(){return O},set:function(a){O=a}},legendRightAxisHint:{get:function(){return P},set:function(a){P=a}},margin:{get:function(){return w},set:function(a){w.top=void 0!==a.top?a.top:w.top,w.right=void 0!==a.right?a.right:w.right,w.bottom=void 0!==a.bottom?a.bottom:w.bottom,w.left=void 0!==a.left?a.left:w.left}},focusMargin:{get:function(){return x},set:function(a){x.top=void 0!==a.top?a.top:x.top,x.right=void 0!==a.right?a.right:x.right,x.bottom=void 0!==a.bottom?a.bottom:x.bottom,x.left=void 0!==a.left?a.left:x.left}},duration:{get:function(){return L},set:function(a){L=a}},color:{get:function(){return C},set:function(b){C=a.utils.getColor(b),t.color(C)}},x:{get:function(){return A},set:function(a){A=a,j.x(a),k.x(a),l.x(a),m.x(a)}},y:{get:function(){return B},set:function(a){B=a,j.y(a),k.y(a),l.y(a),m.y(a)}},switchYAxisOrder:{get:function(){return Q},set:function(a){if(Q!==a){var b=p;p=q,q=b;var c=r;r=s,s=c}Q=a,p.orient("left"),q.orient("right"),r.orient("left"),s.orient("right")}}}),a.utils.inheritOptions(b,j),a.utils.initOptions(b),b},a.models.multiBar=function(){"use strict";function b(F){return D.reset(),F.each(function(b){var F=k-j.left-j.right,G=l-j.top-j.bottom;p=d3.select(this),a.utils.initSVG(p);var H=0;if(x&&b.length&&(x=[{values:b[0].values.map(function(a){return{x:a.x,y:0,series:a.series,size:.01}})}]),u){var I=d3.layout.stack().offset(v).values(function(a){return a.values}).y(r)(!b.length&&x?x:b);I.forEach(function(a,c){a.nonStackable?(b[c].nonStackableSeries=H++,I[c]=b[c]):c>0&&I[c-1].nonStackable&&I[c].values.map(function(a,b){a.y0-=I[c-1].values[b].y,a.y1=a.y0+a.y})}),b=I}b.forEach(function(a,b){a.values.forEach(function(c){c.series=b,c.key=a.key})}),u&&b.length>0&&b[0].values.map(function(a,c){var d=0,e=0;b.map(function(a,f){if(!b[f].nonStackable){var g=a.values[c];g.size=Math.abs(g.y),g.y<0?(g.y1=e,e-=g.size):(g.y1=g.size+d,d+=g.size)}})});var J=d&&e?[]:b.map(function(a,b){return a.values.map(function(a,c){return{x:q(a,c),y:r(a,c),y0:a.y0,y1:a.y1,idx:b}})});m.domain(d||d3.merge(J).map(function(a){return a.x})).rangeBands(f||[0,F],A),n.domain(e||d3.extent(d3.merge(J).map(function(a){var c=a.y;return u&&!b[a.idx].nonStackable&&(c=a.y>0?a.y1:a.y1+a.y),c}).concat(s))).range(g||[G,0]),m.domain()[0]===m.domain()[1]&&(m.domain()[0]?m.domain([m.domain()[0]-.01*m.domain()[0],m.domain()[1]+.01*m.domain()[1]]):m.domain([-1,1])),n.domain()[0]===n.domain()[1]&&(n.domain()[0]?n.domain([n.domain()[0]+.01*n.domain()[0],n.domain()[1]-.01*n.domain()[1]]):n.domain([-1,1])),h=h||m,i=i||n;var K=p.selectAll("g.nv-wrap.nv-multibar").data([b]),L=K.enter().append("g").attr("class","nvd3 nv-wrap nv-multibar"),M=L.append("defs"),N=L.append("g"),O=K.select("g");N.append("g").attr("class","nv-groups"),K.attr("transform","translate("+j.left+","+j.top+")"),M.append("clipPath").attr("id","nv-edge-clip-"+o).append("rect"),K.select("#nv-edge-clip-"+o+" rect").attr("width",F).attr("height",G),O.attr("clip-path",t?"url(#nv-edge-clip-"+o+")":"");var P=K.select(".nv-groups").selectAll(".nv-group").data(function(a){return a},function(a,b){return b});P.enter().append("g").style("stroke-opacity",1e-6).style("fill-opacity",1e-6);var Q=D.transition(P.exit().selectAll("rect.nv-bar"),"multibarExit",Math.min(100,z)).attr("y",function(a,c,d){var e=i(0)||0;return u&&b[a.series]&&!b[a.series].nonStackable&&(e=i(a.y0)),e}).attr("height",0).remove();Q.delay&&Q.delay(function(a,b){var c=b*(z/(E+1))-b;return c}),P.attr("class",function(a,b){return"nv-group nv-series-"+b}).classed("hover",function(a){return a.hover}).style("fill",function(a,b){return w(a,b)}).style("stroke",function(a,b){return w(a,b)}),P.style("stroke-opacity",1).style("fill-opacity",B);var R=P.selectAll("rect.nv-bar").data(function(a){return x&&!b.length?x.values:a.values});R.exit().remove();R.enter().append("rect").attr("class",function(a,b){return r(a,b)<0?"nv-bar negative":"nv-bar positive"}).attr("x",function(a,c,d){return u&&!b[d].nonStackable?0:d*m.rangeBand()/b.length}).attr("y",function(a,c,d){return i(u&&!b[d].nonStackable?a.y0:0)||0}).attr("height",0).attr("width",function(a,c,d){return m.rangeBand()/(u&&!b[d].nonStackable?1:b.length)}).attr("transform",function(a,b){return"translate("+m(q(a,b))+",0)"});R.style("fill",function(a,b,c){return w(a,c,b)}).style("stroke",function(a,b,c){return w(a,c,b)}).on("mouseover",function(a,b){d3.select(this).classed("hover",!0),C.elementMouseover({data:a,index:b,color:d3.select(this).style("fill")})}).on("mouseout",function(a,b){d3.select(this).classed("hover",!1),C.elementMouseout({data:a,index:b,color:d3.select(this).style("fill")})}).on("mousemove",function(a,b){C.elementMousemove({data:a,index:b,color:d3.select(this).style("fill")})}).on("click",function(a,b){var c=this;C.elementClick({data:a,index:b,color:d3.select(this).style("fill"),event:d3.event,element:c}),d3.event.stopPropagation()}).on("dblclick",function(a,b){C.elementDblClick({data:a,index:b,color:d3.select(this).style("fill")}),d3.event.stopPropagation()}),R.attr("class",function(a,b){return r(a,b)<0?"nv-bar negative":"nv-bar positive"}).attr("transform",function(a,b){return"translate("+m(q(a,b))+",0)"}),y&&(c||(c=b.map(function(){return!0})),R.style("fill",function(a,b,d){return d3.rgb(y(a,b)).darker(c.map(function(a,b){return b}).filter(function(a,b){return!c[b]})[d]).toString()}).style("stroke",function(a,b,d){return d3.rgb(y(a,b)).darker(c.map(function(a,b){return b}).filter(function(a,b){return!c[b]})[d]).toString()}));var S=R.watchTransition(D,"multibar",Math.min(250,z)).delay(function(a,c){return c*z/b[0].values.length});u?S.attr("y",function(a,c,d){var e=0;return e=b[d].nonStackable?r(a,c)<0?n(0):n(0)-n(r(a,c))<-1?n(0)-1:n(r(a,c))||0:n(a.y1)}).attr("height",function(a,c,d){return b[d].nonStackable?Math.max(Math.abs(n(r(a,c))-n(0)),0)||0:Math.max(Math.abs(n(a.y+a.y0)-n(a.y0)),0)}).attr("x",function(a,c,d){var e=0;return b[d].nonStackable&&(e=a.series*m.rangeBand()/b.length,b.length!==H&&(e=b[d].nonStackableSeries*m.rangeBand()/(2*H))),e}).attr("width",function(a,c,d){if(b[d].nonStackable){var e=m.rangeBand()/H;return b.length!==H&&(e=m.rangeBand()/(2*H)),e}return m.rangeBand()}):S.attr("x",function(a,c){return a.series*m.rangeBand()/b.length}).attr("width",m.rangeBand()/b.length).attr("y",function(a,b){return r(a,b)<0?n(0):n(0)-n(r(a,b))<1?n(0)-1:n(r(a,b))||0}).attr("height",function(a,b){return Math.max(Math.abs(n(r(a,b))-n(0)),1)||0}),h=m.copy(),i=n.copy(),b[0]&&b[0].values&&(E=b[0].values.length)}),D.renderEnd("multibar immediate"),b}var c,d,e,f,g,h,i,j={top:0,right:0,bottom:0,left:0},k=960,l=500,m=d3.scale.ordinal(),n=d3.scale.linear(),o=Math.floor(1e4*Math.random()),p=null,q=function(a){return a.x},r=function(a){return a.y},s=[0],t=!0,u=!1,v="zero",w=a.utils.defaultColor(),x=!1,y=null,z=500,A=.1,B=.75,C=d3.dispatch("chartClick","elementClick","elementDblClick","elementMouseover","elementMouseout","elementMousemove","renderEnd"),D=a.utils.renderWatch(C,z),E=0;return b.dispatch=C,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return k},set:function(a){k=a}},height:{get:function(){return l},set:function(a){l=a}},x:{get:function(){return q},set:function(a){q=a}},y:{get:function(){return r},set:function(a){r=a}},xScale:{get:function(){return m},set:function(a){m=a}},yScale:{get:function(){return n},set:function(a){n=a}},xDomain:{get:function(){return d},set:function(a){d=a}},yDomain:{get:function(){return e},set:function(a){e=a}},xRange:{get:function(){return f},set:function(a){f=a}},yRange:{get:function(){return g},set:function(a){g=a}},forceY:{get:function(){return s},set:function(a){s=a}},stacked:{get:function(){return u},set:function(a){u=a}},stackOffset:{get:function(){return v},set:function(a){v=a}},clipEdge:{get:function(){return t},set:function(a){t=a}},disabled:{get:function(){return c},set:function(a){c=a}},id:{get:function(){return o},set:function(a){o=a}},hideable:{get:function(){return x},set:function(a){x=a}},groupSpacing:{get:function(){return A},set:function(a){A=a}},fillOpacity:{get:function(){return B},set:function(a){B=a}},margin:{get:function(){return j},set:function(a){j.top=void 0!==a.top?a.top:j.top,j.right=void 0!==a.right?a.right:j.right,j.bottom=void 0!==a.bottom?a.bottom:j.bottom,j.left=void 0!==a.left?a.left:j.left}},duration:{get:function(){return z},set:function(a){z=a,D.reset(z)}},color:{get:function(){return w},set:function(b){w=a.utils.getColor(b)}},barColor:{get:function(){return y},set:function(b){y=b?a.utils.getColor(b):null}}}),a.utils.initOptions(b),b},a.models.multiBarChart=function(){"use strict";function b(B){return G.reset(),G.models(e),s&&G.models(f),t&&G.models(g),B.each(function(B){var G=d3.select(this);a.utils.initSVG(G);var K=a.utils.availableWidth(m,G,l),L=a.utils.availableHeight(n,G,l);if(b.update=function(){0===E?G.call(b):G.transition().duration(E).call(b)},b.container=this,z.setter(J(B),b.update).getter(I(B)).update(),z.disabled=B.map(function(a){return!!a.disabled}),!A){var M;A={};for(M in z)z[M]instanceof Array?A[M]=z[M].slice(0):A[M]=z[M]}if(!(B&&B.length&&B.filter(function(a){return a.values.length}).length))return a.utils.noData(b,G),b;G.selectAll(".nv-noData").remove(),c=e.xScale(),d=e.yScale();var N=G.selectAll("g.nv-wrap.nv-multiBarWithLegend").data([B]),O=N.enter().append("g").attr("class","nvd3 nv-wrap nv-multiBarWithLegend").append("g"),P=N.select("g");if(O.append("g").attr("class","nv-x nv-axis"),O.append("g").attr("class","nv-y nv-axis"),O.append("g").attr("class","nv-barsWrap"),O.append("g").attr("class","nv-legendWrap"),O.append("g").attr("class","nv-controlsWrap"),O.append("g").attr("class","nv-interactive"),r?(i.width(K-D()),P.select(".nv-legendWrap").datum(B).call(i),i.height()>l.top&&(l.top=i.height(),L=a.utils.availableHeight(n,G,l)),P.select(".nv-legendWrap").attr("transform","translate("+D()+","+-l.top+")")):P.select(".nv-legendWrap").selectAll("*").remove(),p){var Q=[{key:q.grouped||"Grouped",disabled:e.stacked()},{key:q.stacked||"Stacked",disabled:!e.stacked()}];j.width(D()).color(["#444","#444","#444"]),P.select(".nv-controlsWrap").datum(Q).attr("transform","translate(0,"+-l.top+")").call(j)}else P.select(".nv-controlsWrap").selectAll("*").remove();N.attr("transform","translate("+l.left+","+l.top+")"),u&&P.select(".nv-y.nv-axis").attr("transform","translate("+K+",0)"),e.disabled(B.map(function(a){return a.disabled})).width(K).height(L).color(B.map(function(a,b){return a.color||o(a,b)}).filter(function(a,b){return!B[b].disabled}));var R=P.select(".nv-barsWrap").datum(B.filter(function(a){return!a.disabled}));if(R.call(e),s){f.scale(c)._ticks(a.utils.calcTicksX(K/100,B)).tickSize(-L,0),P.select(".nv-x.nv-axis").attr("transform","translate(0,"+d.range()[0]+")"),P.select(".nv-x.nv-axis").call(f);var S=P.select(".nv-x.nv-axis > g").selectAll("g");if(S.selectAll("line, text").style("opacity",1),w){var T=function(a,b){return"translate("+a+","+b+")"},U=5,V=17;S.selectAll("text").attr("transform",function(a,b,c){return T(0,c%2==0?U:V)});var W=d3.selectAll(".nv-x.nv-axis .nv-wrap g g text")[0].length;P.selectAll(".nv-x.nv-axis .nv-axisMaxMin text").attr("transform",function(a,b){return T(0,0===b||W%2!==0?V:U)})}x&&P.selectAll(".tick text").call(a.utils.wrapTicks,b.xAxis.rangeBand()),v&&S.filter(function(a,b){return b%Math.ceil(B[0].values.length/(K/100))!==0}).selectAll("text, line").style("opacity",0),y&&S.selectAll(".tick text").attr("transform","rotate("+y+" 0,0)").style("text-anchor",y>0?"start":"end"),P.select(".nv-x.nv-axis").selectAll("g.nv-axisMaxMin text").style("opacity",1)}t&&(g.scale(d)._ticks(a.utils.calcTicksY(L/36,B)).tickSize(-K,0),P.select(".nv-y.nv-axis").call(g)),F&&(h.width(K).height(L).margin({left:l.left,top:l.top}).svgContainer(G).xScale(c),N.select(".nv-interactive").call(h)),i.dispatch.on("stateChange",function(a){for(var c in a)z[c]=a[c];C.stateChange(z),b.update()}),j.dispatch.on("legendClick",function(a,c){if(a.disabled){switch(Q=Q.map(function(a){return a.disabled=!0,a}),a.disabled=!1,a.key){case"Grouped":case q.grouped:e.stacked(!1);break;case"Stacked":case q.stacked:e.stacked(!0)}z.stacked=e.stacked(),C.stateChange(z),b.update()}}),C.on("changeState",function(a){"undefined"!=typeof a.disabled&&(B.forEach(function(b,c){b.disabled=a.disabled[c]}),z.disabled=a.disabled),"undefined"!=typeof a.stacked&&(e.stacked(a.stacked),z.stacked=a.stacked,H=a.stacked),b.update()}),F?(h.dispatch.on("elementMousemove",function(a){if(void 0!=a.pointXValue){var d,e,f,g,i=[];B.filter(function(a,b){return a.seriesIndex=b,!a.disabled}).forEach(function(h,j){e=c.domain().indexOf(a.pointXValue);var k=h.values[e];void 0!==k&&(g=k.x,void 0===d&&(d=k),void 0===f&&(f=a.mouseX),i.push({key:h.key,value:b.y()(k,e),color:o(h,h.seriesIndex),data:h.values[e]}))}),h.tooltip.data({value:g,index:e,series:i})(),h.renderGuideLine(f)}}),h.dispatch.on("elementMouseout",function(a){h.tooltip.hidden(!0)})):(e.dispatch.on("elementMouseover.tooltip",function(a){a.value=b.x()(a.data),a.series={key:a.data.key,value:b.y()(a.data),color:a.color},k.data(a).hidden(!1)}),e.dispatch.on("elementMouseout.tooltip",function(a){k.hidden(!0)}),e.dispatch.on("elementMousemove.tooltip",function(a){k()}))}),G.renderEnd("multibarchart immediate"),b}var c,d,e=a.models.multiBar(),f=a.models.axis(),g=a.models.axis(),h=a.interactiveGuideline(),i=a.models.legend(),j=a.models.legend(),k=a.models.tooltip(),l={top:30,right:20,bottom:50,left:60},m=null,n=null,o=a.utils.defaultColor(),p=!0,q={},r=!0,s=!0,t=!0,u=!1,v=!0,w=!1,x=!1,y=0,z=a.utils.state(),A=null,B=null,C=d3.dispatch("stateChange","changeState","renderEnd"),D=function(){return p?180:0},E=250,F=!1;z.stacked=!1,e.stacked(!1),f.orient("bottom").tickPadding(7).showMaxMin(!1).tickFormat(function(a){return a}),g.orient(u?"right":"left").tickFormat(d3.format(",.1f")),k.duration(0).valueFormatter(function(a,b){return g.tickFormat()(a,b)}).headerFormatter(function(a,b){return f.tickFormat()(a,b)}),j.updateState(!1);var G=a.utils.renderWatch(C),H=!1,I=function(a){return function(){return{active:a.map(function(a){return!a.disabled}),stacked:H}}},J=function(a){return function(b){void 0!==b.stacked&&(H=b.stacked),void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}};return b.dispatch=C,b.multibar=e,b.legend=i,b.controls=j,b.xAxis=f,b.yAxis=g,b.state=z,b.tooltip=k,b.interactiveLayer=h,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return m},set:function(a){m=a}},height:{get:function(){return n},set:function(a){n=a}},showLegend:{get:function(){return r},set:function(a){r=a}},showControls:{get:function(){return p},set:function(a){p=a}},controlLabels:{get:function(){return q},set:function(a){q=a}},showXAxis:{get:function(){return s},set:function(a){s=a}},showYAxis:{get:function(){return t},set:function(a){t=a}},defaultState:{get:function(){return A},set:function(a){A=a}},noData:{get:function(){return B},set:function(a){B=a}},reduceXTicks:{get:function(){return v},set:function(a){v=a}},rotateLabels:{get:function(){return y},set:function(a){y=a}},staggerLabels:{get:function(){return w},set:function(a){w=a}},wrapLabels:{get:function(){return x},set:function(a){x=!!a}},margin:{get:function(){return l},set:function(a){l.top=void 0!==a.top?a.top:l.top,l.right=void 0!==a.right?a.right:l.right,l.bottom=void 0!==a.bottom?a.bottom:l.bottom,l.left=void 0!==a.left?a.left:l.left}},duration:{get:function(){return E},set:function(a){E=a,e.duration(E),f.duration(E),g.duration(E),G.reset(E)}},color:{get:function(){return o},set:function(b){o=a.utils.getColor(b),i.color(o)}},rightAlignYAxis:{get:function(){return u},set:function(a){u=a,g.orient(u?"right":"left")}},useInteractiveGuideline:{get:function(){return F},set:function(a){F=a}},barColor:{get:function(){return e.barColor},set:function(a){e.barColor(a),i.color(function(a,b){return d3.rgb("#ccc").darker(1.5*b).toString()})}}}),a.utils.inheritOptions(b,e),a.utils.initOptions(b),b},a.models.multiBarHorizontal=function(){"use strict";function b(m){return F.reset(),m.each(function(b){var m=k-j.left-j.right,D=l-j.top-j.bottom;n=d3.select(this),a.utils.initSVG(n),w&&(b=d3.layout.stack().offset("zero").values(function(a){return a.values}).y(r)(b)),b.forEach(function(a,b){a.values.forEach(function(c){c.series=b,c.key=a.key})}),w&&b[0].values.map(function(a,c){var d=0,e=0;b.map(function(a){var b=a.values[c];b.size=Math.abs(b.y),b.y<0?(b.y1=e-b.size,e-=b.size):(b.y1=d,d+=b.size)})});var G=d&&e?[]:b.map(function(a){return a.values.map(function(a,b){return{x:q(a,b),y:r(a,b),y0:a.y0,y1:a.y1}})});o.domain(d||d3.merge(G).map(function(a){return a.x})).rangeBands(f||[0,D],A),p.domain(e||d3.extent(d3.merge(G).map(function(a){return w?a.y>0?a.y1+a.y:a.y1:a.y}).concat(t))),x&&!w?p.range(g||[p.domain()[0]<0?z:0,m-(p.domain()[1]>0?z:0)]):p.range(g||[0,m]),h=h||o,i=i||d3.scale.linear().domain(p.domain()).range([p(0),p(0)]);var H=d3.select(this).selectAll("g.nv-wrap.nv-multibarHorizontal").data([b]),I=H.enter().append("g").attr("class","nvd3 nv-wrap nv-multibarHorizontal"),J=(I.append("defs"),I.append("g"));H.select("g");J.append("g").attr("class","nv-groups"),H.attr("transform","translate("+j.left+","+j.top+")");var K=H.select(".nv-groups").selectAll(".nv-group").data(function(a){return a},function(a,b){return b});K.enter().append("g").style("stroke-opacity",1e-6).style("fill-opacity",1e-6),K.exit().watchTransition(F,"multibarhorizontal: exit groups").style("stroke-opacity",1e-6).style("fill-opacity",1e-6).remove(),K.attr("class",function(a,b){return"nv-group nv-series-"+b}).classed("hover",function(a){return a.hover}).style("fill",function(a,b){return u(a,b)}).style("stroke",function(a,b){return u(a,b)}),K.watchTransition(F,"multibarhorizontal: groups").style("stroke-opacity",1).style("fill-opacity",B);var L=K.selectAll("g.nv-bar").data(function(a){return a.values});L.exit().remove();var M=L.enter().append("g").attr("transform",function(a,c,d){return"translate("+i(w?a.y0:0)+","+(w?0:d*o.rangeBand()/b.length+o(q(a,c)))+")"});M.append("rect").attr("width",0).attr("height",o.rangeBand()/(w?1:b.length)),L.on("mouseover",function(a,b){d3.select(this).classed("hover",!0),E.elementMouseover({data:a,index:b,color:d3.select(this).style("fill")})}).on("mouseout",function(a,b){d3.select(this).classed("hover",!1),E.elementMouseout({data:a,index:b,color:d3.select(this).style("fill")})}).on("mouseout",function(a,b){E.elementMouseout({data:a,index:b,color:d3.select(this).style("fill")})}).on("mousemove",function(a,b){E.elementMousemove({data:a,index:b,color:d3.select(this).style("fill")})}).on("click",function(a,b){var c=this;E.elementClick({data:a,index:b,color:d3.select(this).style("fill"),event:d3.event,element:c}),d3.event.stopPropagation()}).on("dblclick",function(a,b){E.elementDblClick({data:a,index:b,color:d3.select(this).style("fill")}),d3.event.stopPropagation()}),s(b[0],0)&&(M.append("polyline"),L.select("polyline").attr("fill","none").attr("points",function(a,c){var d=s(a,c),e=.8*o.rangeBand()/(2*(w?1:b.length));d=d.length?d:[-Math.abs(d),Math.abs(d)],d=d.map(function(a){return p(a)-p(0)});var f=[[d[0],-e],[d[0],e],[d[0],0],[d[1],0],[d[1],-e],[d[1],e]];return f.map(function(a){return a.join(",")}).join(" ")}).attr("transform",function(a,c){var d=o.rangeBand()/(2*(w?1:b.length));return"translate("+(r(a,c)<0?0:p(r(a,c))-p(0))+", "+d+")"})),M.append("text"),x&&!w?(L.select("text").attr("text-anchor",function(a,b){return r(a,b)<0?"end":"start"}).attr("y",o.rangeBand()/(2*b.length)).attr("dy",".32em").text(function(a,b){var c=C(r(a,b)),d=s(a,b);return void 0===d?c:d.length?c+"+"+C(Math.abs(d[1]))+"-"+C(Math.abs(d[0])):c+"±"+C(Math.abs(d))}),L.watchTransition(F,"multibarhorizontal: bars").select("text").attr("x",function(a,b){return r(a,b)<0?-4:p(r(a,b))-p(0)+4})):L.selectAll("text").text(""),y&&!w?(M.append("text").classed("nv-bar-label",!0),L.select("text.nv-bar-label").attr("text-anchor",function(a,b){return r(a,b)<0?"start":"end"}).attr("y",o.rangeBand()/(2*b.length)).attr("dy",".32em").text(function(a,b){return q(a,b)}),L.watchTransition(F,"multibarhorizontal: bars").select("text.nv-bar-label").attr("x",function(a,b){return r(a,b)<0?p(0)-p(r(a,b))+4:-4})):L.selectAll("text.nv-bar-label").text(""),L.attr("class",function(a,b){return r(a,b)<0?"nv-bar negative":"nv-bar positive"}),v&&(c||(c=b.map(function(){return!0})),L.style("fill",function(a,b,d){return d3.rgb(v(a,b)).darker(c.map(function(a,b){return b}).filter(function(a,b){return!c[b]})[d]).toString()}).style("stroke",function(a,b,d){return d3.rgb(v(a,b)).darker(c.map(function(a,b){return b}).filter(function(a,b){return!c[b]})[d]).toString()})),w?L.watchTransition(F,"multibarhorizontal: bars").attr("transform",function(a,b){return"translate("+p(a.y1)+","+o(q(a,b))+")"}).select("rect").attr("width",function(a,b){return Math.abs(p(r(a,b)+a.y0)-p(a.y0))||0}).attr("height",o.rangeBand()):L.watchTransition(F,"multibarhorizontal: bars").attr("transform",function(a,c){return"translate("+p(r(a,c)<0?r(a,c):0)+","+(a.series*o.rangeBand()/b.length+o(q(a,c)))+")"}).select("rect").attr("height",o.rangeBand()/b.length).attr("width",function(a,b){return Math.max(Math.abs(p(r(a,b))-p(0)),1)||0}),h=o.copy(),i=p.copy()}),F.renderEnd("multibarHorizontal immediate"),b}var c,d,e,f,g,h,i,j={top:0,right:0,bottom:0,left:0},k=960,l=500,m=Math.floor(1e4*Math.random()),n=null,o=d3.scale.ordinal(),p=d3.scale.linear(),q=function(a){return a.x},r=function(a){return a.y},s=function(a){return a.yErr},t=[0],u=a.utils.defaultColor(),v=null,w=!1,x=!1,y=!1,z=60,A=.1,B=.75,C=d3.format(",.2f"),D=250,E=d3.dispatch("chartClick","elementClick","elementDblClick","elementMouseover","elementMouseout","elementMousemove","renderEnd"),F=a.utils.renderWatch(E,D);return b.dispatch=E,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return k},set:function(a){k=a}},height:{get:function(){return l},set:function(a){l=a}},x:{get:function(){return q},set:function(a){q=a}},y:{get:function(){return r},set:function(a){r=a}},yErr:{get:function(){return s},set:function(a){s=a}},xScale:{get:function(){return o},set:function(a){o=a}},yScale:{get:function(){return p},set:function(a){p=a}},xDomain:{get:function(){return d},set:function(a){d=a}},yDomain:{get:function(){return e},set:function(a){e=a}},xRange:{get:function(){return f},set:function(a){f=a}},yRange:{get:function(){return g},set:function(a){g=a}},forceY:{get:function(){return t},set:function(a){t=a}},stacked:{get:function(){return w},set:function(a){w=a}},showValues:{get:function(){return x},set:function(a){x=a}},disabled:{get:function(){return c},set:function(a){c=a}},id:{get:function(){return m},set:function(a){m=a}},valueFormat:{get:function(){return C},set:function(a){C=a}},valuePadding:{get:function(){return z},set:function(a){z=a}},groupSpacing:{get:function(){return A},set:function(a){A=a}},fillOpacity:{get:function(){return B},set:function(a){B=a}},margin:{get:function(){return j},set:function(a){j.top=void 0!==a.top?a.top:j.top,j.right=void 0!==a.right?a.right:j.right,j.bottom=void 0!==a.bottom?a.bottom:j.bottom,j.left=void 0!==a.left?a.left:j.left}},duration:{get:function(){return D},set:function(a){D=a,F.reset(D)}},color:{get:function(){return u},set:function(b){u=a.utils.getColor(b)}},barColor:{get:function(){return v},set:function(b){v=b?a.utils.getColor(b):null}}}),a.utils.initOptions(b),b},a.models.multiBarHorizontalChart=function(){"use strict";function b(j){return C.reset(),C.models(e),r&&C.models(f),s&&C.models(g),j.each(function(j){var w=d3.select(this);a.utils.initSVG(w);var C=a.utils.availableWidth(l,w,k),D=a.utils.availableHeight(m,w,k);if(b.update=function(){w.transition().duration(z).call(b)},b.container=this,t=e.stacked(),u.setter(B(j),b.update).getter(A(j)).update(),u.disabled=j.map(function(a){return!!a.disabled}),!v){var E;v={};for(E in u)u[E]instanceof Array?v[E]=u[E].slice(0):v[E]=u[E]}if(!(j&&j.length&&j.filter(function(a){return a.values.length}).length))return a.utils.noData(b,w),b;w.selectAll(".nv-noData").remove(),c=e.xScale(),d=e.yScale().clamp(!0);var F=w.selectAll("g.nv-wrap.nv-multiBarHorizontalChart").data([j]),G=F.enter().append("g").attr("class","nvd3 nv-wrap nv-multiBarHorizontalChart").append("g"),H=F.select("g");if(G.append("g").attr("class","nv-x nv-axis"),G.append("g").attr("class","nv-y nv-axis").append("g").attr("class","nv-zeroLine").append("line"),G.append("g").attr("class","nv-barsWrap"),G.append("g").attr("class","nv-legendWrap"),G.append("g").attr("class","nv-controlsWrap"),q?(h.width(C-y()),H.select(".nv-legendWrap").datum(j).call(h),h.height()>k.top&&(k.top=h.height(),D=a.utils.availableHeight(m,w,k)),H.select(".nv-legendWrap").attr("transform","translate("+y()+","+-k.top+")")):H.select(".nv-legendWrap").selectAll("*").remove(),o){var I=[{key:p.grouped||"Grouped",disabled:e.stacked()},{key:p.stacked||"Stacked",disabled:!e.stacked()}];i.width(y()).color(["#444","#444","#444"]),H.select(".nv-controlsWrap").datum(I).attr("transform","translate(0,"+-k.top+")").call(i)}else H.select(".nv-controlsWrap").selectAll("*").remove();F.attr("transform","translate("+k.left+","+k.top+")"),e.disabled(j.map(function(a){return a.disabled})).width(C).height(D).color(j.map(function(a,b){return a.color||n(a,b)}).filter(function(a,b){return!j[b].disabled}));var J=H.select(".nv-barsWrap").datum(j.filter(function(a){return!a.disabled}));if(J.transition().call(e),r){f.scale(c)._ticks(a.utils.calcTicksY(D/24,j)).tickSize(-C,0),H.select(".nv-x.nv-axis").call(f);var K=H.select(".nv-x.nv-axis").selectAll("g");K.selectAll("line, text")}s&&(g.scale(d)._ticks(a.utils.calcTicksX(C/100,j)).tickSize(-D,0),H.select(".nv-y.nv-axis").attr("transform","translate(0,"+D+")"),H.select(".nv-y.nv-axis").call(g)),H.select(".nv-zeroLine line").attr("x1",d(0)).attr("x2",d(0)).attr("y1",0).attr("y2",-D),h.dispatch.on("stateChange",function(a){for(var c in a)u[c]=a[c];x.stateChange(u),b.update()}),i.dispatch.on("legendClick",function(a,c){if(a.disabled){switch(I=I.map(function(a){return a.disabled=!0,a}),a.disabled=!1,a.key){case"Grouped":case p.grouped:e.stacked(!1);break;case"Stacked":case p.stacked:e.stacked(!0)}u.stacked=e.stacked(),x.stateChange(u),t=e.stacked(),b.update()}}),x.on("changeState",function(a){"undefined"!=typeof a.disabled&&(j.forEach(function(b,c){b.disabled=a.disabled[c]}),u.disabled=a.disabled),"undefined"!=typeof a.stacked&&(e.stacked(a.stacked),u.stacked=a.stacked,t=a.stacked),b.update()})}),C.renderEnd("multibar horizontal chart immediate"),b}var c,d,e=a.models.multiBarHorizontal(),f=a.models.axis(),g=a.models.axis(),h=a.models.legend().height(30),i=a.models.legend().height(30),j=a.models.tooltip(),k={top:30,right:20,bottom:50,left:60},l=null,m=null,n=a.utils.defaultColor(),o=!0,p={},q=!0,r=!0,s=!0,t=!1,u=a.utils.state(),v=null,w=null,x=d3.dispatch("stateChange","changeState","renderEnd"),y=function(){return o?180:0},z=250;u.stacked=!1,e.stacked(t),f.orient("left").tickPadding(5).showMaxMin(!1).tickFormat(function(a){return a}),g.orient("bottom").tickFormat(d3.format(",.1f")),j.duration(0).valueFormatter(function(a,b){return g.tickFormat()(a,b)}).headerFormatter(function(a,b){return f.tickFormat()(a,b)}),i.updateState(!1);var A=function(a){return function(){return{active:a.map(function(a){return!a.disabled}),stacked:t}}},B=function(a){return function(b){void 0!==b.stacked&&(t=b.stacked),void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}},C=a.utils.renderWatch(x,z);return e.dispatch.on("elementMouseover.tooltip",function(a){a.value=b.x()(a.data),a.series={key:a.data.key,value:b.y()(a.data),color:a.color},j.data(a).hidden(!1)}),e.dispatch.on("elementMouseout.tooltip",function(a){j.hidden(!0)}),e.dispatch.on("elementMousemove.tooltip",function(a){j()}),b.dispatch=x,b.multibar=e,b.legend=h,b.controls=i,b.xAxis=f,b.yAxis=g,b.state=u,b.tooltip=j,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return l},set:function(a){l=a}},height:{get:function(){return m},set:function(a){m=a}},showLegend:{get:function(){return q},set:function(a){q=a}},showControls:{get:function(){return o},set:function(a){o=a}},controlLabels:{get:function(){return p},set:function(a){p=a}},showXAxis:{get:function(){return r},set:function(a){r=a}},showYAxis:{get:function(){return s},set:function(a){s=a}},defaultState:{get:function(){return v},set:function(a){v=a}},noData:{get:function(){return w},set:function(a){w=a}},margin:{get:function(){return k},set:function(a){k.top=void 0!==a.top?a.top:k.top,k.right=void 0!==a.right?a.right:k.right,k.bottom=void 0!==a.bottom?a.bottom:k.bottom,k.left=void 0!==a.left?a.left:k.left}},duration:{get:function(){return z},set:function(a){z=a,C.reset(z),e.duration(z),f.duration(z),g.duration(z)}},color:{get:function(){return n},set:function(b){n=a.utils.getColor(b),h.color(n)}},barColor:{get:function(){return e.barColor},set:function(a){e.barColor(a),h.color(function(a,b){return d3.rgb("#ccc").darker(1.5*b).toString()})}}}),a.utils.inheritOptions(b,e),a.utils.initOptions(b),b},a.models.multiChart=function(){"use strict";function b(j){return j.each(function(j){function n(a){var b=2===j[a.seriesIndex].yAxis?E:D;a.value=a.point.x,a.series={value:a.point.y,color:a.point.color,key:a.series.key},G.duration(0).headerFormatter(function(a,b){return C.tickFormat()(a,b)}).valueFormatter(function(a,c){return b.tickFormat()(a,c)}).data(a).hidden(!1)}function H(a){var b=2===j[a.seriesIndex].yAxis?E:D;a.value=a.point.x,a.series={value:a.point.y,color:a.point.color,key:a.series.key},G.duration(100).headerFormatter(function(a,b){return C.tickFormat()(a,b)}).valueFormatter(function(a,c){return b.tickFormat()(a,c)}).data(a).hidden(!1)}function J(a){var b=2===j[a.seriesIndex].yAxis?E:D;a.point.x=A.x()(a.point),a.point.y=A.y()(a.point),G.duration(0).headerFormatter(function(a,b){return C.tickFormat()(a,b)}).valueFormatter(function(a,c){return b.tickFormat()(a,c)}).data(a).hidden(!1)}function K(a){var b=2===j[a.data.series].yAxis?E:D;a.value=y.x()(a.data),a.series={value:y.y()(a.data),color:a.color,key:a.data.key},G.duration(0).headerFormatter(function(a,b){return C.tickFormat()(a,b)}).valueFormatter(function(a,c){return b.tickFormat()(a,c)}).data(a).hidden(!1)}function L(){for(var a=0,b=I.length;b>a;a++){var c=I[a];try{c.clearHighlights()}catch(d){}}}function M(a,b,c){for(var d=0,e=I.length;e>d;d++){var f=I[d];try{f.highlightPoint(a,b,c)}catch(g){}}}var N=d3.select(this);a.utils.initSVG(N),b.update=function(){N.transition().call(b)},b.container=this;var O=a.utils.availableWidth(g,N,e),P=a.utils.availableHeight(h,N,e),Q=j.filter(function(a){return"line"==a.type&&1==a.yAxis}),R=j.filter(function(a){return"line"==a.type&&2==a.yAxis}),S=j.filter(function(a){return"scatter"==a.type&&1==a.yAxis}),T=j.filter(function(a){return"scatter"==a.type&&2==a.yAxis}),U=j.filter(function(a){return"bar"==a.type&&1==a.yAxis}),V=j.filter(function(a){return"bar"==a.type&&2==a.yAxis}),W=j.filter(function(a){return"area"==a.type&&1==a.yAxis}),X=j.filter(function(a){return"area"==a.type&&2==a.yAxis});if(!(j&&j.length&&j.filter(function(a){return a.values.length}).length))return a.utils.noData(b,N),b;N.selectAll(".nv-noData").remove();var Y=j.filter(function(a){return!a.disabled&&1==a.yAxis}).map(function(a){return a.values.map(function(a,b){return{x:k(a),y:l(a)}})}),Z=j.filter(function(a){return!a.disabled&&2==a.yAxis}).map(function(a){return a.values.map(function(a,b){return{x:k(a),y:l(a)}})});r.domain(d3.extent(d3.merge(Y.concat(Z)),function(a){return a.x})).range([0,O]);var $=N.selectAll("g.wrap.multiChart").data([j]),_=$.enter().append("g").attr("class","wrap nvd3 multiChart").append("g");_.append("g").attr("class","nv-x nv-axis"),_.append("g").attr("class","nv-y1 nv-axis"),_.append("g").attr("class","nv-y2 nv-axis"),_.append("g").attr("class","stack1Wrap"),_.append("g").attr("class","stack2Wrap"),_.append("g").attr("class","bars1Wrap"),_.append("g").attr("class","bars2Wrap"),_.append("g").attr("class","scatters1Wrap"),_.append("g").attr("class","scatters2Wrap"),_.append("g").attr("class","lines1Wrap"),_.append("g").attr("class","lines2Wrap"),_.append("g").attr("class","legendWrap"),_.append("g").attr("class","nv-interactive");var aa=$.select("g"),ba=j.map(function(a,b){return j[b].color||f(a,b)});if(i){var ca=F.align()?O/2:O,da=F.align()?ca:0;F.width(ca),F.color(ba),aa.select(".legendWrap").datum(j.map(function(a){return a.originalKey=void 0===a.originalKey?a.key:a.originalKey,a.key=a.originalKey+(1==a.yAxis?"":q),a})).call(F),F.height()>e.top&&(e.top=F.height(),P=a.utils.availableHeight(h,N,e)),aa.select(".legendWrap").attr("transform","translate("+da+","+-e.top+")")}else aa.select(".legendWrap").selectAll("*").remove();u.width(O).height(P).interpolate(m).color(ba.filter(function(a,b){return!j[b].disabled&&1==j[b].yAxis&&"line"==j[b].type})),v.width(O).height(P).interpolate(m).color(ba.filter(function(a,b){return!j[b].disabled&&2==j[b].yAxis&&"line"==j[b].type})),w.width(O).height(P).color(ba.filter(function(a,b){return!j[b].disabled&&1==j[b].yAxis&&"scatter"==j[b].type})),x.width(O).height(P).color(ba.filter(function(a,b){return!j[b].disabled&&2==j[b].yAxis&&"scatter"==j[b].type})),y.width(O).height(P).color(ba.filter(function(a,b){return!j[b].disabled&&1==j[b].yAxis&&"bar"==j[b].type})),z.width(O).height(P).color(ba.filter(function(a,b){return!j[b].disabled&&2==j[b].yAxis&&"bar"==j[b].type})),A.width(O).height(P).interpolate(m).color(ba.filter(function(a,b){return!j[b].disabled&&1==j[b].yAxis&&"area"==j[b].type})),B.width(O).height(P).interpolate(m).color(ba.filter(function(a,b){return!j[b].disabled&&2==j[b].yAxis&&"area"==j[b].type})),aa.attr("transform","translate("+e.left+","+e.top+")");var ea=aa.select(".lines1Wrap").datum(Q.filter(function(a){return!a.disabled})),fa=aa.select(".scatters1Wrap").datum(S.filter(function(a){return!a.disabled})),ga=aa.select(".bars1Wrap").datum(U.filter(function(a){return!a.disabled})),ha=aa.select(".stack1Wrap").datum(W.filter(function(a){return!a.disabled})),ia=aa.select(".lines2Wrap").datum(R.filter(function(a){return!a.disabled})),ja=aa.select(".scatters2Wrap").datum(T.filter(function(a){return!a.disabled})),ka=aa.select(".bars2Wrap").datum(V.filter(function(a){return!a.disabled})),la=aa.select(".stack2Wrap").datum(X.filter(function(a){return!a.disabled})),ma=W.length?W.map(function(a){return a.values}).reduce(function(a,b){return a.map(function(a,c){return{x:a.x,y:a.y+b[c].y}})}).concat([{x:0,y:0}]):[],na=X.length?X.map(function(a){return a.values}).reduce(function(a,b){return a.map(function(a,c){return{x:a.x,y:a.y+b[c].y}})}).concat([{x:0,y:0}]):[];s.domain(c||d3.extent(d3.merge(Y).concat(ma),function(a){return a.y})).range([0,P]),t.domain(d||d3.extent(d3.merge(Z).concat(na),function(a){return a.y})).range([0,P]),u.yDomain(s.domain()),w.yDomain(s.domain()),y.yDomain(s.domain()),A.yDomain(s.domain()),v.yDomain(t.domain()),x.yDomain(t.domain()),z.yDomain(t.domain()),B.yDomain(t.domain()),W.length&&d3.transition(ha).call(A),X.length&&d3.transition(la).call(B),U.length&&d3.transition(ga).call(y),V.length&&d3.transition(ka).call(z),Q.length&&d3.transition(ea).call(u),R.length&&d3.transition(ia).call(v),S.length&&d3.transition(fa).call(w),T.length&&d3.transition(ja).call(x),C._ticks(a.utils.calcTicksX(O/100,j)).tickSize(-P,0),aa.select(".nv-x.nv-axis").attr("transform","translate(0,"+P+")"),d3.transition(aa.select(".nv-x.nv-axis")).call(C),D._ticks(a.utils.calcTicksY(P/36,j)).tickSize(-O,0),d3.transition(aa.select(".nv-y1.nv-axis")).call(D),E._ticks(a.utils.calcTicksY(P/36,j)).tickSize(-O,0),d3.transition(aa.select(".nv-y2.nv-axis")).call(E),aa.select(".nv-y1.nv-axis").classed("nv-disabled",Y.length?!1:!0).attr("transform","translate("+r.range()[0]+",0)"),aa.select(".nv-y2.nv-axis").classed("nv-disabled",Z.length?!1:!0).attr("transform","translate("+r.range()[1]+",0)"),F.dispatch.on("stateChange",function(a){b.update()}),p&&(o.width(O).height(P).margin({left:e.left,top:e.top}).svgContainer(N).xScale(r),$.select(".nv-interactive").call(o)),p?(o.dispatch.on("elementMousemove",function(c){L();var d,e,g,h=[];j.filter(function(a,b){return a.seriesIndex=b,!a.disabled}).forEach(function(i,j){var k=r.domain(),l=i.values.filter(function(a,c){return b.x()(a,c)>=k[0]&&b.x()(a,c)<=k[1]});e=a.interactiveBisect(l,c.pointXValue,b.x());var m=l[e],n=b.y()(m,e);null!==n&&M(j,e,!0),void 0!==m&&(void 0===d&&(d=m),void 0===g&&(g=r(b.x()(m,e))),h.push({key:i.key,value:n,color:f(i,i.seriesIndex),data:m,yAxis:2==i.yAxis?E:D}))});var i=function(a,b){var c=h[b].yAxis;return null==a?"N/A":c.tickFormat()(a)};o.tooltip.headerFormatter(function(a,b){return C.tickFormat()(a,b)}).valueFormatter(o.tooltip.valueFormatter()||i).data({value:b.x()(d,e),index:e,series:h})(),o.renderGuideLine(g)}),o.dispatch.on("elementMouseout",function(a){L()})):(u.dispatch.on("elementMouseover.tooltip",n),v.dispatch.on("elementMouseover.tooltip",n),u.dispatch.on("elementMouseout.tooltip",function(a){G.hidden(!0)}),v.dispatch.on("elementMouseout.tooltip",function(a){G.hidden(!0)}),w.dispatch.on("elementMouseover.tooltip",H),x.dispatch.on("elementMouseover.tooltip",H),w.dispatch.on("elementMouseout.tooltip",function(a){G.hidden(!0)}),x.dispatch.on("elementMouseout.tooltip",function(a){G.hidden(!0)}),A.dispatch.on("elementMouseover.tooltip",J),B.dispatch.on("elementMouseover.tooltip",J),A.dispatch.on("elementMouseout.tooltip",function(a){G.hidden(!0)}),B.dispatch.on("elementMouseout.tooltip",function(a){G.hidden(!0)}),y.dispatch.on("elementMouseover.tooltip",K),z.dispatch.on("elementMouseover.tooltip",K),y.dispatch.on("elementMouseout.tooltip",function(a){G.hidden(!0)}),z.dispatch.on("elementMouseout.tooltip",function(a){G.hidden(!0)}),y.dispatch.on("elementMousemove.tooltip",function(a){G()}),z.dispatch.on("elementMousemove.tooltip",function(a){G()}))}),b}var c,d,e={top:30,right:20,bottom:50,left:60},f=a.utils.defaultColor(),g=null,h=null,i=!0,j=null,k=function(a){return a.x},l=function(a){return a.y},m="linear",n=!0,o=a.interactiveGuideline(),p=!1,q=" (right axis)",r=d3.scale.linear(),s=d3.scale.linear(),t=d3.scale.linear(),u=a.models.line().yScale(s),v=a.models.line().yScale(t),w=a.models.scatter().yScale(s),x=a.models.scatter().yScale(t),y=a.models.multiBar().stacked(!1).yScale(s),z=a.models.multiBar().stacked(!1).yScale(t),A=a.models.stackedArea().yScale(s),B=a.models.stackedArea().yScale(t),C=a.models.axis().scale(r).orient("bottom").tickPadding(5),D=a.models.axis().scale(s).orient("left"),E=a.models.axis().scale(t).orient("right"),F=a.models.legend().height(30),G=a.models.tooltip(),H=d3.dispatch(),I=[u,v,w,x,y,z,A,B];return b.dispatch=H,b.legend=F,b.lines1=u,b.lines2=v,b.scatters1=w,b.scatters2=x,b.bars1=y,b.bars2=z,b.stack1=A,b.stack2=B,b.xAxis=C,b.yAxis1=D,b.yAxis2=E,b.tooltip=G,b.interactiveLayer=o,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return g},set:function(a){g=a}},height:{get:function(){return h},set:function(a){h=a}},showLegend:{get:function(){return i},set:function(a){i=a}},yDomain1:{get:function(){return c},set:function(a){c=a}},yDomain2:{get:function(){return d},set:function(a){d=a}},noData:{get:function(){return j},set:function(a){j=a}},interpolate:{get:function(){return m},set:function(a){m=a}},legendRightAxisHint:{get:function(){return q},set:function(a){q=a}},margin:{get:function(){return e},set:function(a){e.top=void 0!==a.top?a.top:e.top,e.right=void 0!==a.right?a.right:e.right,e.bottom=void 0!==a.bottom?a.bottom:e.bottom,e.left=void 0!==a.left?a.left:e.left}},color:{get:function(){return f},set:function(b){f=a.utils.getColor(b)}},x:{get:function(){return k},set:function(a){k=a,u.x(a),v.x(a),w.x(a),x.x(a),y.x(a),z.x(a),A.x(a),B.x(a)}},y:{get:function(){return l},set:function(a){l=a,u.y(a),v.y(a),w.y(a),x.y(a),A.y(a),B.y(a),y.y(a),z.y(a)}},useVoronoi:{get:function(){return n},set:function(a){n=a,u.useVoronoi(a),v.useVoronoi(a),A.useVoronoi(a),B.useVoronoi(a)}},useInteractiveGuideline:{get:function(){return p},set:function(a){p=a,p&&(u.interactive(!1),u.useVoronoi(!1),v.interactive(!1),v.useVoronoi(!1),A.interactive(!1),A.useVoronoi(!1),B.interactive(!1),B.useVoronoi(!1),w.interactive(!1),x.interactive(!1))}}}),a.utils.initOptions(b),b},a.models.ohlcBar=function(){"use strict";function b(y){return y.each(function(b){k=d3.select(this);var y=a.utils.availableWidth(h,k,g),A=a.utils.availableHeight(i,k,g);a.utils.initSVG(k);var B=y/b[0].values.length*.9;l.domain(c||d3.extent(b[0].values.map(n).concat(t))),v?l.range(e||[.5*y/b[0].values.length,y*(b[0].values.length-.5)/b[0].values.length]):l.range(e||[5+B/2,y-B/2-5]),m.domain(d||[d3.min(b[0].values.map(s).concat(u)),d3.max(b[0].values.map(r).concat(u))]).range(f||[A,0]),l.domain()[0]===l.domain()[1]&&(l.domain()[0]?l.domain([l.domain()[0]-.01*l.domain()[0],l.domain()[1]+.01*l.domain()[1]]):l.domain([-1,1])),m.domain()[0]===m.domain()[1]&&(m.domain()[0]?m.domain([m.domain()[0]+.01*m.domain()[0],m.domain()[1]-.01*m.domain()[1]]):m.domain([-1,1]));var C=d3.select(this).selectAll("g.nv-wrap.nv-ohlcBar").data([b[0].values]),D=C.enter().append("g").attr("class","nvd3 nv-wrap nv-ohlcBar"),E=D.append("defs"),F=D.append("g"),G=C.select("g");F.append("g").attr("class","nv-ticks"),C.attr("transform","translate("+g.left+","+g.top+")"),k.on("click",function(a,b){z.chartClick({data:a,index:b,pos:d3.event,id:j})}),E.append("clipPath").attr("id","nv-chart-clip-path-"+j).append("rect"),C.select("#nv-chart-clip-path-"+j+" rect").attr("width",y).attr("height",A),G.attr("clip-path",w?"url(#nv-chart-clip-path-"+j+")":"");var H=C.select(".nv-ticks").selectAll(".nv-tick").data(function(a){return a});H.exit().remove(),H.enter().append("path").attr("class",function(a,b,c){return(p(a,b)>q(a,b)?"nv-tick negative":"nv-tick positive")+" nv-tick-"+c+"-"+b}).attr("d",function(a,b){return"m0,0l0,"+(m(p(a,b))-m(r(a,b)))+"l"+-B/2+",0l"+B/2+",0l0,"+(m(s(a,b))-m(p(a,b)))+"l0,"+(m(q(a,b))-m(s(a,b)))+"l"+B/2+",0l"+-B/2+",0z"}).attr("transform",function(a,b){return"translate("+l(n(a,b))+","+m(r(a,b))+")"}).attr("fill",function(a,b){return x[0]}).attr("stroke",function(a,b){return x[0]}).attr("x",0).attr("y",function(a,b){return m(Math.max(0,o(a,b)))}).attr("height",function(a,b){return Math.abs(m(o(a,b))-m(0))}),H.attr("class",function(a,b,c){return(p(a,b)>q(a,b)?"nv-tick negative":"nv-tick positive")+" nv-tick-"+c+"-"+b}),d3.transition(H).attr("transform",function(a,b){return"translate("+l(n(a,b))+","+m(r(a,b))+")"}).attr("d",function(a,c){var d=y/b[0].values.length*.9;return"m0,0l0,"+(m(p(a,c))-m(r(a,c)))+"l"+-d/2+",0l"+d/2+",0l0,"+(m(s(a,c))-m(p(a,c)))+"l0,"+(m(q(a,c))-m(s(a,c)))+"l"+d/2+",0l"+-d/2+",0z"})}),b}var c,d,e,f,g={top:0,right:0,bottom:0,left:0},h=null,i=null,j=Math.floor(1e4*Math.random()),k=null,l=d3.scale.linear(),m=d3.scale.linear(),n=function(a){return a.x},o=function(a){return a.y},p=function(a){return a.open},q=function(a){return a.close},r=function(a){return a.high},s=function(a){return a.low},t=[],u=[],v=!1,w=!0,x=a.utils.defaultColor(),y=!1,z=d3.dispatch("stateChange","changeState","renderEnd","chartClick","elementClick","elementDblClick","elementMouseover","elementMouseout","elementMousemove");return b.highlightPoint=function(a,c){b.clearHighlights(),k.select(".nv-ohlcBar .nv-tick-0-"+a).classed("hover",c)},b.clearHighlights=function(){k.select(".nv-ohlcBar .nv-tick.hover").classed("hover",!1)},b.dispatch=z,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return h},set:function(a){h=a}},height:{get:function(){return i},set:function(a){i=a}},xScale:{get:function(){return l},set:function(a){l=a}},yScale:{get:function(){return m},set:function(a){m=a}},xDomain:{get:function(){return c},set:function(a){c=a}},yDomain:{get:function(){return d},set:function(a){d=a}},xRange:{get:function(){return e},set:function(a){e=a}},yRange:{get:function(){return f},set:function(a){f=a}},forceX:{get:function(){return t},set:function(a){t=a}},forceY:{get:function(){return u},set:function(a){u=a}},padData:{get:function(){return v},set:function(a){v=a}},clipEdge:{get:function(){return w},set:function(a){w=a}},id:{get:function(){return j},set:function(a){j=a}},interactive:{get:function(){return y},set:function(a){y=a}},x:{get:function(){return n},set:function(a){n=a}},y:{get:function(){return o},set:function(a){o=a}},open:{get:function(){return p()},set:function(a){p=a}},close:{get:function(){return q()},set:function(a){q=a}},high:{get:function(){return r},set:function(a){r=a}},low:{get:function(){return s},set:function(a){s=a}},margin:{get:function(){return g},set:function(a){g.top=void 0!=a.top?a.top:g.top,g.right=void 0!=a.right?a.right:g.right,g.bottom=void 0!=a.bottom?a.bottom:g.bottom,g.left=void 0!=a.left?a.left:g.left}},color:{get:function(){return x},set:function(b){x=a.utils.getColor(b)}}}),a.utils.initOptions(b),b},a.models.parallelCoordinates=function(){"use strict";function b(B){return A.reset(),B.each(function(b){function A(a){return x(o.map(function(b){if(isNaN(a.values[b.key])||isNaN(parseFloat(a.values[b.key]))||O){var c=l[b.key].domain(),d=l[b.key].range(),e=c[0]-(c[1]-c[0])/9;if(v.indexOf(b.key)<0){var f=d3.scale.linear().domain([e,c[1]]).range([j-12,d[1]]);l[b.key].brush.y(f),v.push(b.key)}if(isNaN(a.values[b.key])||isNaN(parseFloat(a.values[b.key])))return[k(b.key),l[b.key](e)]}return void 0!==U&&(v.length>0||O?(U.style("display","inline"),V.style("display","inline")):(U.style("display","none"),V.style("display","none"))),[k(b.key),l[b.key](a.values[b.key])]}))}function B(a){s.forEach(function(b){var c=l[b.dimension].brush.y().domain();b.hasOnlyNaN&&(b.extent[1]=(l[b.dimension].domain()[1]-c[0])*(b.extent[1]-b.extent[0])/(N[b.dimension]-b.extent[0])+c[0]),b.hasNaN&&(b.extent[0]=c[0]),a&&l[b.dimension].brush.extent(b.extent)}),e.select(".nv-brushBackground").each(function(a){d3.select(this).call(l[a.key].brush)}).selectAll("rect").attr("x",-8).attr("width",16),F()}function C(){q===!1&&(q=!0,B(!0))}function D(){$=p.filter(function(a){return!l[a].brush.empty()}),_=$.map(function(a){return l[a].brush.extent()}),s=[],$.forEach(function(a,b){s[b]={dimension:a,extent:_[b],hasNaN:!1,hasOnlyNaN:!1}}),t=[],c.style("display",function(a){var b=$.every(function(b,c){return(isNaN(a.values[b])||isNaN(parseFloat(a.values[b])))&&_[c][0]==l[b].brush.y().domain()[0]?!0:_[c][0]<=a.values[b]&&a.values[b]<=_[c][1]&&!isNaN(parseFloat(a.values[b]))});return b&&t.push(a),b?null:"none"}),F(),z.brush({filters:s,active:t})}function E(){var a=$.length>0?!0:!1;s.forEach(function(a){a.extent[0]===l[a.dimension].brush.y().domain()[0]&&v.indexOf(a.dimension)>=0&&(a.hasNaN=!0),a.extent[1]<l[a.dimension].domain()[0]&&(a.hasOnlyNaN=!0)}),z.brushEnd(t,a)}function F(){e.select(".nv-axis").each(function(a,b){var c=s.filter(function(b){return b.dimension==a.key});P[a.key]=l[a.key].domain(),0!=c.length&&q&&(P[a.key]=[],c[0].extent[1]>l[a.key].domain()[0]&&(P[a.key]=[c[0].extent[1]]),c[0].extent[0]>=l[a.key].domain()[0]&&P[a.key].push(c[0].extent[0])),d3.select(this).call(y.scale(l[a.key]).tickFormat(a.format).tickValues(P[a.key]))})}function G(a){u[a.key]=this.parentNode.__origin__=k(a.key),d.attr("visibility","hidden")}function H(a){u[a.key]=Math.min(i,Math.max(0,this.parentNode.__origin__+=d3.event.x)),c.attr("d",A),o.sort(function(a,b){return J(a.key)-J(b.key)}),o.forEach(function(a,b){return a.currentPosition=b}),k.domain(o.map(function(a){return a.key})),e.attr("transform",function(a){return"translate("+J(a.key)+")"})}function I(a,b){delete this.parentNode.__origin__,delete u[a.key],d3.select(this.parentNode).attr("transform","translate("+k(a.key)+")"),c.attr("d",A),d.attr("d",A).attr("visibility",null),z.dimensionsOrder(o)}function J(a){var b=u[a];return null==b?k(a):b}var K=d3.select(this);if(i=a.utils.availableWidth(g,K,f),j=a.utils.availableHeight(h,K,f),a.utils.initSVG(K),void 0===b[0].values){var L=[];b.forEach(function(a){var b={},c=Object.keys(a);c.forEach(function(c){"name"!==c&&(b[c]=a[c])}),L.push({key:a.name,values:b})}),b=L}var M=b.map(function(a){return a.values});0===t.length&&(t=b),p=n.sort(function(a,b){return a.currentPosition-b.currentPosition}).map(function(a){return a.key}),o=n.filter(function(a){return!a.disabled}),k.rangePoints([0,i],1).domain(o.map(function(a){return a.key}));var N={},O=!1,P=[];p.forEach(function(a){var b=d3.extent(M,function(b){return+b[a]}),c=b[0],d=b[1],e=!1;(isNaN(c)||isNaN(d))&&(e=!0,c=0,d=0),c===d&&(c-=1,d+=1);var f=s.filter(function(b){return b.dimension==a});0!==f.length&&(e?(c=l[a].domain()[0],d=l[a].domain()[1]):!f[0].hasOnlyNaN&&q?(c=c>f[0].extent[0]?f[0].extent[0]:c,d=d<f[0].extent[1]?f[0].extent[1]:d):f[0].hasNaN&&(d=d<f[0].extent[1]?f[0].extent[1]:d,N[a]=l[a].domain()[1],O=!0)),l[a]=d3.scale.linear().domain([c,d]).range([.9*(j-12),0]),v=[],l[a].brush=d3.svg.brush().y(l[a]).on("brushstart",C).on("brush",D).on("brushend",E)});var Q=K.selectAll("g.nv-wrap.nv-parallelCoordinates").data([b]),R=Q.enter().append("g").attr("class","nvd3 nv-wrap nv-parallelCoordinates"),S=R.append("g"),T=Q.select("g");S.append("g").attr("class","nv-parallelCoordinates background"),S.append("g").attr("class","nv-parallelCoordinates foreground"),S.append("g").attr("class","nv-parallelCoordinates missingValuesline"),Q.attr("transform","translate("+f.left+","+f.top+")"),x.interpolate("cardinal").tension(w),y.orient("left");var U,V,W=d3.behavior.drag().on("dragstart",G).on("drag",H).on("dragend",I),X=k.range()[1]-k.range()[0];if(!isNaN(X)){var Y=[0+X/2,j-12,i-X/2,j-12];U=Q.select(".missingValuesline").selectAll("line").data([Y]),U.enter().append("line"),U.exit().remove(),U.attr("x1",function(a){return a[0]}).attr("y1",function(a){return a[1]}).attr("x2",function(a){return a[2]}).attr("y2",function(a){return a[3]}),V=Q.select(".missingValuesline").selectAll("text").data([m]),V.append("text").data([m]),V.enter().append("text"),V.exit().remove(),V.attr("y",j).attr("x",i-92-X/2).text(function(a){return a})}d=Q.select(".background").selectAll("path").data(b),d.enter().append("path"),d.exit().remove(),d.attr("d",A),c=Q.select(".foreground").selectAll("path").data(b),c.enter().append("path"),c.exit().remove(),c.attr("d",A).style("stroke-width",function(a,b){return isNaN(a.strokeWidth)&&(a.strokeWidth=1),a.strokeWidth}).attr("stroke",function(a,b){return a.color||r(a,b)}),c.on("mouseover",function(a,b){d3.select(this).classed("hover",!0).style("stroke-width",a.strokeWidth+2+"px").style("stroke-opacity",1),z.elementMouseover({label:a.name,color:a.color||r(a,b),values:a.values,dimensions:o})}),c.on("mouseout",function(a,b){d3.select(this).classed("hover",!1).style("stroke-width",a.strokeWidth+"px").style("stroke-opacity",.7),z.elementMouseout({label:a.name,index:b})}),c.on("mousemove",function(a,b){z.elementMousemove()}),c.on("click",function(a){z.elementClick({id:a.id})}),e=T.selectAll(".dimension").data(o);var Z=e.enter().append("g").attr("class","nv-parallelCoordinates dimension");e.attr("transform",function(a){return"translate("+k(a.key)+",0)"}),Z.append("g").attr("class","nv-axis"),Z.append("text").attr("class","nv-label").style("cursor","move").attr("dy","-1em").attr("text-anchor","middle").on("mouseover",function(a,b){z.elementMouseover({label:a.tooltip||a.key,color:a.color})}).on("mouseout",function(a,b){z.elementMouseout({label:a.tooltip})}).on("mousemove",function(a,b){z.elementMousemove()}).call(W),Z.append("g").attr("class","nv-brushBackground"),e.exit().remove(),e.select(".nv-label").text(function(a){return a.key}),B(q);var $=p.filter(function(a){return!l[a].brush.empty()}),_=$.map(function(a){return l[a].brush.extent()}),aa=t.slice(0);t=[],c.style("display",function(a){var b=$.every(function(b,c){return(isNaN(a.values[b])||isNaN(parseFloat(a.values[b])))&&_[c][0]==l[b].brush.y().domain()[0]?!0:_[c][0]<=a.values[b]&&a.values[b]<=_[c][1]&&!isNaN(parseFloat(a.values[b]))});return b&&t.push(a),b?null:"none"}),(s.length>0||!a.utils.arrayEquals(t,aa))&&z.activeChanged(t)}),b}var c,d,e,f={top:30,right:0,bottom:10,left:0},g=null,h=null,i=null,j=null,k=d3.scale.ordinal(),l={},m="undefined values",n=[],o=[],p=[],q=!0,r=a.utils.defaultColor(),s=[],t=[],u=[],v=[],w=1,x=d3.svg.line(),y=d3.svg.axis(),z=d3.dispatch("brushstart","brush","brushEnd","dimensionsOrder","stateChange","elementClick","elementMouseover","elementMouseout","elementMousemove","renderEnd","activeChanged"),A=a.utils.renderWatch(z);return b.dispatch=z,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return g},set:function(a){g=a}},height:{get:function(){return h},set:function(a){h=a}},dimensionData:{get:function(){return n},set:function(a){n=a}},displayBrush:{get:function(){return q},set:function(a){q=a}},filters:{get:function(){return s},set:function(a){s=a}},active:{get:function(){return t},set:function(a){t=a}},lineTension:{get:function(){return w},set:function(a){w=a}},undefinedValuesLabel:{get:function(){return m},set:function(a){m=a}},dimensions:{get:function(){return n.map(function(a){return a.key})},set:function(b){a.deprecated("dimensions","use dimensionData instead"),0===n.length?b.forEach(function(a){n.push({key:a})}):b.forEach(function(a,b){n[b].key=a})}},dimensionNames:{get:function(){return n.map(function(a){return a.key})},set:function(b){a.deprecated("dimensionNames","use dimensionData instead"),p=[],0===n.length?b.forEach(function(a){n.push({key:a})}):b.forEach(function(a,b){n[b].key=a})}},dimensionFormats:{get:function(){return n.map(function(a){return a.format})},set:function(b){a.deprecated("dimensionFormats","use dimensionData instead"),0===n.length?b.forEach(function(a){n.push({format:a})}):b.forEach(function(a,b){n[b].format=a})}},margin:{get:function(){return f},set:function(a){f.top=void 0!==a.top?a.top:f.top,f.right=void 0!==a.right?a.right:f.right,f.bottom=void 0!==a.bottom?a.bottom:f.bottom,f.left=void 0!==a.left?a.left:f.left}},color:{get:function(){return r},set:function(b){r=a.utils.getColor(b)}}}),a.utils.initOptions(b),b},a.models.parallelCoordinatesChart=function(){"use strict";function b(e){return r.reset(),r.models(c),e.each(function(e){var j=d3.select(this);a.utils.initSVG(j);var o=a.utils.availableWidth(g,j,f),p=a.utils.availableHeight(h,j,f);if(b.update=function(){j.call(b)},b.container=this,k.setter(t(l),b.update).getter(s(l)).update(),k.disabled=l.map(function(a){return!!a.disabled}),l=l.map(function(a){return a.disabled=!!a.disabled,a}),l.forEach(function(a,b){a.originalPosition=isNaN(a.originalPosition)?b:a.originalPosition,a.currentPosition=isNaN(a.currentPosition)?b:a.currentPosition}),!n){var r;n={};for(r in k)k[r]instanceof Array?n[r]=k[r].slice(0):n[r]=k[r]}if(!e||!e.length)return a.utils.noData(b,j),b;j.selectAll(".nv-noData").remove();var u=j.selectAll("g.nv-wrap.nv-parallelCoordinatesChart").data([e]),v=u.enter().append("g").attr("class","nvd3 nv-wrap nv-parallelCoordinatesChart").append("g"),w=u.select("g");v.append("g").attr("class","nv-parallelCoordinatesWrap"),v.append("g").attr("class","nv-legendWrap"),w.select("rect").attr("width",o).attr("height",p>0?p:0),i?(d.width(o).color(function(a){return"rgb(188,190,192)"}),w.select(".nv-legendWrap").datum(l.sort(function(a,b){return a.originalPosition-b.originalPosition})).call(d),d.height()>f.top&&(f.top=d.height(),p=a.utils.availableHeight(h,j,f)),u.select(".nv-legendWrap").attr("transform","translate( 0 ,"+-f.top+")")):w.select(".nv-legendWrap").selectAll("*").remove(),u.attr("transform","translate("+f.left+","+f.top+")"),c.width(o).height(p).dimensionData(l).displayBrush(m);var x=w.select(".nv-parallelCoordinatesWrap ").datum(e);x.transition().call(c),c.dispatch.on("brushEnd",function(a,b){b?(m=!0,q.brushEnd(a)):m=!1}),d.dispatch.on("stateChange",function(a){for(var c in a)k[c]=a[c];q.stateChange(k),b.update()}),c.dispatch.on("dimensionsOrder",function(a){l.sort(function(a,b){return a.currentPosition-b.currentPosition});var b=!1;l.forEach(function(a,c){a.currentPosition=c,a.currentPosition!==a.originalPosition&&(b=!0)}),q.dimensionsOrder(l,b)}),q.on("changeState",function(a){"undefined"!=typeof a.disabled&&(l.forEach(function(b,c){b.disabled=a.disabled[c]}),k.disabled=a.disabled),b.update()})}),r.renderEnd("parraleleCoordinateChart immediate"),b}var c=a.models.parallelCoordinates(),d=a.models.legend(),e=a.models.tooltip(),f=(a.models.tooltip(),{top:0,right:0,bottom:0,left:0}),g=null,h=null,i=!0,j=a.utils.defaultColor(),k=a.utils.state(),l=[],m=!0,n=null,o=null,p="undefined",q=d3.dispatch("dimensionsOrder","brushEnd","stateChange","changeState","renderEnd"),r=a.utils.renderWatch(q),s=function(a){return function(){return{active:a.map(function(a){return!a.disabled})}}},t=function(a){return function(b){void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}};return e.contentGenerator(function(a){var b='<table><thead><tr><td class="legend-color-guide"><div style="background-color:'+a.color+'"></div></td><td><strong>'+a.key+"</strong></td></tr></thead>";return 0!==a.series.length&&(b+='<tbody><tr><td height ="10px"></td></tr>',a.series.forEach(function(a){b=b+'<tr><td class="legend-color-guide"><div style="background-color:'+a.color+'"></div></td><td class="key">'+a.key+'</td><td class="value">'+a.value+"</td></tr>"}),b+="</tbody>"),b+="</table>"}),c.dispatch.on("elementMouseover.tooltip",function(a){var b={key:a.label,color:a.color,series:[]};a.values&&(Object.keys(a.values).forEach(function(c){var d=a.dimensions.filter(function(a){return a.key===c})[0];if(d){var e;e=isNaN(a.values[c])||isNaN(parseFloat(a.values[c]))?p:d.format(a.values[c]),b.series.push({idx:d.currentPosition,key:c,value:e,color:d.color})}}),b.series.sort(function(a,b){return a.idx-b.idx})),e.data(b).hidden(!1)}),c.dispatch.on("elementMouseout.tooltip",function(a){e.hidden(!0)}),c.dispatch.on("elementMousemove.tooltip",function(){e()}),b.dispatch=q,b.parallelCoordinates=c,b.legend=d,b.tooltip=e,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return g},set:function(a){g=a}},height:{get:function(){return h},set:function(a){h=a}},showLegend:{get:function(){return i},set:function(a){i=a}},defaultState:{get:function(){return n},set:function(a){n=a}},dimensionData:{get:function(){return l},set:function(a){l=a}},displayBrush:{get:function(){return m},set:function(a){m=a}},noData:{get:function(){return o},set:function(a){o=a}},nanValue:{get:function(){return p},set:function(a){p=a}},margin:{get:function(){return f},set:function(a){f.top=void 0!==a.top?a.top:f.top,f.right=void 0!==a.right?a.right:f.right,f.bottom=void 0!==a.bottom?a.bottom:f.bottom,f.left=void 0!==a.left?a.left:f.left}},color:{get:function(){return j},set:function(b){j=a.utils.getColor(b),d.color(j),c.color(j)}}}),a.utils.inheritOptions(b,c),a.utils.initOptions(b),b},a.models.pie=function(){"use strict";function b(F){return E.reset(),F.each(function(b){function F(a,b){a.endAngle=isNaN(a.endAngle)?0:a.endAngle,a.startAngle=isNaN(a.startAngle)?0:a.startAngle,p||(a.innerRadius=0);var c=d3.interpolate(this._current,a);return this._current=c(0),function(a){return C[b](c(a))}}var G=d-c.left-c.right,H=e-c.top-c.bottom,I=Math.min(G,H)/2,J=[],K=[];if(i=d3.select(this),0===A.length)for(var L=I-I/5,M=y*I,N=0;N<b[0].length;N++)J.push(L),K.push(M);else r?(J=A.map(function(a){return(a.outer-a.outer/5)*I}),K=A.map(function(a){return(a.inner-a.inner/5)*I}),y=d3.min(A.map(function(a){return a.inner-a.inner/5}))):(J=A.map(function(a){return a.outer*I}),K=A.map(function(a){return a.inner*I}),y=d3.min(A.map(function(a){return a.inner})));a.utils.initSVG(i);var O=i.selectAll(".nv-wrap.nv-pie").data(b),P=O.enter().append("g").attr("class","nvd3 nv-wrap nv-pie nv-chart-"+h),Q=P.append("g"),R=O.select("g"),S=Q.append("g").attr("class","nv-pie");Q.append("g").attr("class","nv-pieLabels"),O.attr("transform","translate("+c.left+","+c.top+")"),R.select(".nv-pie").attr("transform","translate("+G/2+","+H/2+")"),R.select(".nv-pieLabels").attr("transform","translate("+G/2+","+H/2+")"),i.on("click",function(a,b){B.chartClick({data:a,index:b,pos:d3.event,id:h})}),C=[],D=[];for(var N=0;N<b[0].length;N++){var T=d3.svg.arc().outerRadius(J[N]),U=d3.svg.arc().outerRadius(J[N]+5);u!==!1&&(T.startAngle(u),U.startAngle(u)),w!==!1&&(T.endAngle(w),U.endAngle(w)),p&&(T.innerRadius(K[N]),U.innerRadius(K[N])),T.cornerRadius&&x&&(T.cornerRadius(x),U.cornerRadius(x)),C.push(T),D.push(U)}var V=d3.layout.pie().sort(null).value(function(a){return a.disabled?0:g(a)});V.padAngle&&v&&V.padAngle(v),p&&q&&(S.append("text").attr("class","nv-pie-title"),O.select(".nv-pie-title").style("text-anchor","middle").text(function(a){return q}).style("font-size",Math.min(G,H)*y*2/(q.length+2)+"px").attr("dy","0.35em").attr("transform",function(a,b){return"translate(0, "+s+")"}));var W=O.select(".nv-pie").selectAll(".nv-slice").data(V),X=O.select(".nv-pieLabels").selectAll(".nv-label").data(V);W.exit().remove(),X.exit().remove();var Y=W.enter().append("g");Y.attr("class","nv-slice"),Y.on("mouseover",function(a,b){d3.select(this).classed("hover",!0),r&&d3.select(this).select("path").transition().duration(70).attr("d",D[b]),B.elementMouseover({data:a.data,index:b,color:d3.select(this).style("fill"),percent:(a.endAngle-a.startAngle)/(2*Math.PI)})}),Y.on("mouseout",function(a,b){d3.select(this).classed("hover",!1),r&&d3.select(this).select("path").transition().duration(50).attr("d",C[b]),B.elementMouseout({data:a.data,index:b})}),Y.on("mousemove",function(a,b){B.elementMousemove({data:a.data,index:b})}),Y.on("click",function(a,b){var c=this;B.elementClick({data:a.data,index:b,color:d3.select(this).style("fill"),event:d3.event,element:c})}),Y.on("dblclick",function(a,b){B.elementDblClick({data:a.data,index:b,color:d3.select(this).style("fill")})}),W.attr("fill",function(a,b){return j(a.data,b)}),W.attr("stroke",function(a,b){return j(a.data,b)});Y.append("path").each(function(a){this._current=a});if(W.select("path").transition().duration(z).attr("d",function(a,b){return C[b](a)}).attrTween("d",F),l){for(var Z=[],N=0;N<b[0].length;N++)Z.push(C[N]),m?p&&(Z[N]=d3.svg.arc().outerRadius(C[N].outerRadius()),u!==!1&&Z[N].startAngle(u),w!==!1&&Z[N].endAngle(w)):p||Z[N].innerRadius(0);X.enter().append("g").classed("nv-label",!0).each(function(a,b){var c=d3.select(this);c.attr("transform",function(a,b){if(t){a.outerRadius=J[b]+10,a.innerRadius=J[b]+15;var c=(a.startAngle+a.endAngle)/2*(180/Math.PI);return(a.startAngle+a.endAngle)/2<Math.PI?c-=90:c+=90,"translate("+Z[b].centroid(a)+") rotate("+c+")"}return a.outerRadius=I+10,a.innerRadius=I+15,"translate("+Z[b].centroid(a)+")"}),c.append("rect").style("stroke","#fff").style("fill","#fff").attr("rx",3).attr("ry",3),c.append("text").style("text-anchor",t?(a.startAngle+a.endAngle)/2<Math.PI?"start":"end":"middle").style("fill","#000")});var $={},_=14,aa=140,ba=function(a){return Math.floor(a[0]/aa)*aa+","+Math.floor(a[1]/_)*_},ca=function(a){return(a.endAngle-a.startAngle)/(2*Math.PI)};X.watchTransition(E,"pie labels").attr("transform",function(a,b){if(t){a.outerRadius=J[b]+10,a.innerRadius=J[b]+15;var c=(a.startAngle+a.endAngle)/2*(180/Math.PI);return(a.startAngle+a.endAngle)/2<Math.PI?c-=90:c+=90,"translate("+Z[b].centroid(a)+") rotate("+c+")"}a.outerRadius=I+10,a.innerRadius=I+15;var d=Z[b].centroid(a),e=ca(a);if(a.value&&e>=o){var f=ba(d);$[f]&&(d[1]-=_),$[ba(d)]=!0}return"translate("+d+")"}),X.select(".nv-label text").style("text-anchor",function(a,b){return t?(a.startAngle+a.endAngle)/2<Math.PI?"start":"end":"middle"}).text(function(a,b){var c=ca(a),d="";if(!a.value||o>c)return"";if("function"==typeof n)d=n(a,b,{key:f(a.data),value:g(a.data),percent:k(c)});else switch(n){case"key":d=f(a.data);break;case"value":d=k(g(a.data));break;case"percent":d=d3.format("%")(c)}return d})}}),E.renderEnd("pie immediate"),b}var c={top:0,right:0,bottom:0,left:0},d=500,e=500,f=function(a){return a.x},g=function(a){return a.y},h=Math.floor(1e4*Math.random()),i=null,j=a.utils.defaultColor(),k=d3.format(",.2f"),l=!0,m=!1,n="key",o=.02,p=!1,q=!1,r=!0,s=0,t=!1,u=!1,v=!1,w=!1,x=0,y=.5,z=250,A=[],B=d3.dispatch("chartClick","elementClick","elementDblClick","elementMouseover","elementMouseout","elementMousemove","renderEnd"),C=[],D=[],E=a.utils.renderWatch(B);return b.dispatch=B,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{arcsRadius:{get:function(){return A},set:function(a){A=a}},width:{get:function(){return d},set:function(a){d=a}},height:{get:function(){return e},set:function(a){e=a}},showLabels:{get:function(){return l},set:function(a){l=a}},title:{get:function(){return q},set:function(a){q=a}},titleOffset:{get:function(){return s},set:function(a){s=a}},labelThreshold:{get:function(){return o},set:function(a){o=a}},valueFormat:{get:function(){return k},set:function(a){k=a}},x:{get:function(){return f},set:function(a){f=a}},id:{get:function(){return h},set:function(a){h=a}},endAngle:{get:function(){return w},set:function(a){w=a}},startAngle:{get:function(){return u},set:function(a){u=a}},padAngle:{get:function(){return v},set:function(a){v=a}},cornerRadius:{get:function(){return x},set:function(a){x=a}},donutRatio:{get:function(){return y},set:function(a){y=a}},labelsOutside:{get:function(){return m},set:function(a){m=a}},labelSunbeamLayout:{get:function(){return t},set:function(a){t=a}},donut:{get:function(){return p},set:function(a){p=a}},growOnHover:{get:function(){return r},set:function(a){r=a}},pieLabelsOutside:{get:function(){return m},set:function(b){m=b,a.deprecated("pieLabelsOutside","use labelsOutside instead")}},donutLabelsOutside:{get:function(){return m},set:function(b){m=b,a.deprecated("donutLabelsOutside","use labelsOutside instead")}},labelFormat:{get:function(){return k},set:function(b){k=b,a.deprecated("labelFormat","use valueFormat instead")}},margin:{get:function(){return c},set:function(a){c.top="undefined"!=typeof a.top?a.top:c.top,c.right="undefined"!=typeof a.right?a.right:c.right,c.bottom="undefined"!=typeof a.bottom?a.bottom:c.bottom,c.left="undefined"!=typeof a.left?a.left:c.left}},duration:{get:function(){return z},set:function(a){z=a,E.reset(z)}},y:{get:function(){return g},set:function(a){g=d3.functor(a)}},color:{get:function(){return j},set:function(b){j=a.utils.getColor(b)}},labelType:{get:function(){return n},set:function(a){n=a||"key"}}}),a.utils.initOptions(b),b},a.models.pieChart=function(){"use strict";function b(e){return r.reset(),r.models(c),e.each(function(e){var i=d3.select(this);a.utils.initSVG(i);var l=a.utils.availableWidth(g,i,f),o=a.utils.availableHeight(h,i,f);if(b.update=function(){i.transition().call(b)},b.container=this,m.setter(t(e),b.update).getter(s(e)).update(),m.disabled=e.map(function(a){return!!a.disabled}),!n){var p;n={};for(p in m)m[p]instanceof Array?n[p]=m[p].slice(0):n[p]=m[p]}if(!e||!e.length)return a.utils.noData(b,i),b;i.selectAll(".nv-noData").remove();var r=i.selectAll("g.nv-wrap.nv-pieChart").data([e]),u=r.enter().append("g").attr("class","nvd3 nv-wrap nv-pieChart").append("g"),v=r.select("g");if(u.append("g").attr("class","nv-pieWrap"),u.append("g").attr("class","nv-legendWrap"),j){if("top"===k)d.width(l).key(c.x()),r.select(".nv-legendWrap").datum(e).call(d),d.height()>f.top&&(f.top=d.height(),o=a.utils.availableHeight(h,i,f)),r.select(".nv-legendWrap").attr("transform","translate(0,"+-f.top+")");else if("right"===k){var w=a.models.legend().width();w>l/2&&(w=l/2),d.height(o).key(c.x()),d.width(w),l-=d.width(),r.select(".nv-legendWrap").datum(e).call(d).attr("transform","translate("+l+",0)")}}else v.select(".nv-legendWrap").selectAll("*").remove();r.attr("transform","translate("+f.left+","+f.top+")"),c.width(l).height(o);var x=v.select(".nv-pieWrap").datum([e]);d3.transition(x).call(c),d.dispatch.on("stateChange",function(a){for(var c in a)m[c]=a[c];q.stateChange(m),b.update()}),q.on("changeState",function(a){"undefined"!=typeof a.disabled&&(e.forEach(function(b,c){b.disabled=a.disabled[c]}),m.disabled=a.disabled),b.update()})}),r.renderEnd("pieChart immediate"),b}var c=a.models.pie(),d=a.models.legend(),e=a.models.tooltip(),f={top:30,right:20,bottom:20,left:20},g=null,h=null,i=!1,j=!0,k="top",l=a.utils.defaultColor(),m=a.utils.state(),n=null,o=null,p=250,q=d3.dispatch("stateChange","changeState","renderEnd");e.duration(0).headerEnabled(!1).valueFormatter(function(a,b){return c.valueFormat()(a,b)});var r=a.utils.renderWatch(q),s=function(a){return function(){return{active:a.map(function(a){return!a.disabled})}}},t=function(a){return function(b){void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}};return c.dispatch.on("elementMouseover.tooltip",function(a){a.series={key:b.x()(a.data),value:b.y()(a.data),color:a.color,percent:a.percent},i||(delete a.percent,delete a.series.percent),e.data(a).hidden(!1)}),c.dispatch.on("elementMouseout.tooltip",function(a){e.hidden(!0)}),c.dispatch.on("elementMousemove.tooltip",function(a){e()}),b.legend=d,b.dispatch=q,b.pie=c,b.tooltip=e,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return g},set:function(a){g=a}},height:{get:function(){return h},set:function(a){h=a}},noData:{get:function(){return o},set:function(a){o=a}},showTooltipPercent:{get:function(){return i},set:function(a){i=a}},showLegend:{get:function(){return j},set:function(a){j=a}},legendPosition:{get:function(){return k},set:function(a){k=a}},defaultState:{get:function(){return n},set:function(a){n=a}},color:{get:function(){return l},set:function(a){l=a,d.color(l),c.color(l)}},duration:{get:function(){return p},set:function(a){p=a,r.reset(p),c.duration(p)}},margin:{get:function(){return f},set:function(a){f.top=void 0!==a.top?a.top:f.top,f.right=void 0!==a.right?a.right:f.right,f.bottom=void 0!==a.bottom?a.bottom:f.bottom,f.left=void 0!==a.left?a.left:f.left}}}),a.utils.inheritOptions(b,c),a.utils.initOptions(b),b},a.models.scatter=function(){"use strict";function b(a){var b,c;return b=i=i||{},c=a[0].series,b=b[c]=b[c]||{},c=a[1],b=b[c]=b[c]||{}}function c(a){var c,d,e=a[0],f=b(a),g=!1;for(c=1;c<arguments.length;c++)d=arguments[c],f[d]===e[d]&&f.hasOwnProperty(d)||(f[d]=e[d],g=!0);return g}function d(b){return U.reset(),b.each(function(b){function i(){if(T=!1,!z)return!1;if(P===!0){var c=d3.merge(b.map(function(b,c){return b.values.map(function(b,d){var e=s(b,d),f=t(b,d);return[a.utils.NaNtoZero(p(e))+1e-4*Math.random(),a.utils.NaNtoZero(q(f))+1e-4*Math.random(),c,d,b]}).filter(function(a,b){return A(a[4],b)})}));if(0==c.length)return!1;c.length<3&&(c.push([p.range()[0]-20,q.range()[0]-20,null,null]),c.push([p.range()[1]+20,q.range()[1]+20,null,null]),c.push([p.range()[0]-20,q.range()[0]+20,null,null]),c.push([p.range()[1]+20,q.range()[1]-20,null,null]));var d=d3.geom.polygon([[-10,-10],[-10,l+10],[k+10,l+10],[k+10,-10]]),e=d3.geom.voronoi(c).map(function(a,b){return{data:d.clip(a),series:c[b][2],point:c[b][3]}});_.select(".nv-point-paths").selectAll("path").remove();var f=_.select(".nv-point-paths").selectAll("path").data(e),g=f.enter().append("svg:path").attr("d",function(a){return a&&a.data&&0!==a.data.length?"M"+a.data.join(",")+"Z":"M 0 0"}).attr("id",function(a,b){return"nv-path-"+b}).attr("clip-path",function(a,b){return"url(#nv-clip-"+n+"-"+b+")"});if(F&&g.style("fill",d3.rgb(230,230,230)).style("fill-opacity",.4).style("stroke-opacity",1).style("stroke",d3.rgb(200,200,200)),E){_.select(".nv-point-clips").selectAll("*").remove();var h=_.select(".nv-point-clips").selectAll("clipPath").data(c);h.enter().append("svg:clipPath").attr("id",function(a,b){return"nv-clip-"+n+"-"+b}).append("svg:circle").attr("cx",function(a){return a[0]}).attr("cy",function(a){return a[1]}).attr("r",G)}var i=function(a,c){if(T)return 0;var d=b[a.series];if(void 0!==d){var e=d.values[a.point];e.color=m(d,a.series),e.x=s(e),e.y=t(e);var f=o.node().getBoundingClientRect(),g=window.pageYOffset||document.documentElement.scrollTop,h=window.pageXOffset||document.documentElement.scrollLeft,i={left:p(s(e,a.point))+f.left+h+j.left+10,top:q(t(e,a.point))+f.top+g+j.top+10};c({point:e,series:d,pos:i,relativePos:[p(s(e,a.point))+j.left,q(t(e,a.point))+j.top],seriesIndex:a.series,pointIndex:a.point})}};f.on("click",function(a){i(a,O.elementClick)}).on("dblclick",function(a){i(a,O.elementDblClick)}).on("mouseover",function(a){i(a,O.elementMouseover)}).on("mouseout",function(a,b){i(a,O.elementMouseout)})}else _.select(".nv-groups").selectAll(".nv-group").selectAll(".nv-point").on("click",function(a,c){if(T||!b[a.series])return 0;var d=b[a.series],e=d.values[c],f=this;O.elementClick({point:e,series:d,pos:[p(s(e,c))+j.left,q(t(e,c))+j.top],relativePos:[p(s(e,c))+j.left,q(t(e,c))+j.top],seriesIndex:a.series,pointIndex:c,event:d3.event,element:f})}).on("dblclick",function(a,c){if(T||!b[a.series])return 0;var d=b[a.series],e=d.values[c];O.elementDblClick({point:e,series:d,pos:[p(s(e,c))+j.left,q(t(e,c))+j.top],relativePos:[p(s(e,c))+j.left,q(t(e,c))+j.top],seriesIndex:a.series,pointIndex:c})}).on("mouseover",function(a,c){if(T||!b[a.series])return 0;var d=b[a.series],e=d.values[c];O.elementMouseover({point:e,series:d,pos:[p(s(e,c))+j.left,q(t(e,c))+j.top],relativePos:[p(s(e,c))+j.left,q(t(e,c))+j.top],seriesIndex:a.series,pointIndex:c,color:m(a,c)})}).on("mouseout",function(a,c){if(T||!b[a.series])return 0;var d=b[a.series],e=d.values[c];O.elementMouseout({point:e,series:d,pos:[p(s(e,c))+j.left,q(t(e,c))+j.top],relativePos:[p(s(e,c))+j.left,q(t(e,c))+j.top],seriesIndex:a.series,pointIndex:c,color:m(a,c)})})}o=d3.select(this);var Q=a.utils.availableWidth(k,o,j),W=a.utils.availableHeight(l,o,j);a.utils.initSVG(o),b.forEach(function(a,b){a.values.forEach(function(a){a.series=b})});var X=d.yScale().name===d3.scale.log().name?!0:!1,Y=H&&I&&L?[]:d3.merge(b.map(function(a){return a.values.map(function(a,b){return{x:s(a,b),y:t(a,b),size:u(a,b)}})}));if(p.domain(H||d3.extent(Y.map(function(a){return a.x}).concat(w))),B&&b[0]?p.range(J||[(Q*C+Q)/(2*b[0].values.length),Q-Q*(1+C)/(2*b[0].values.length)]):p.range(J||[0,Q]),X){var Z=d3.min(Y.map(function(a){return 0!==a.y?a.y:void 0}));q.clamp(!0).domain(I||d3.extent(Y.map(function(a){return 0!==a.y?a.y:.1*Z}).concat(x))).range(K||[W,0])}else q.domain(I||d3.extent(Y.map(function(a){return a.y}).concat(x))).range(K||[W,0]);r.domain(L||d3.extent(Y.map(function(a){return a.size}).concat(y))).range(M||V),N=p.domain()[0]===p.domain()[1]||q.domain()[0]===q.domain()[1],p.domain()[0]===p.domain()[1]&&(p.domain()[0]?p.domain([p.domain()[0]-.01*p.domain()[0],p.domain()[1]+.01*p.domain()[1]]):p.domain([-1,1])),q.domain()[0]===q.domain()[1]&&(q.domain()[0]?q.domain([q.domain()[0]-.01*q.domain()[0],q.domain()[1]+.01*q.domain()[1]]):q.domain([-1,1])),isNaN(p.domain()[0])&&p.domain([-1,1]),isNaN(q.domain()[0])&&q.domain([-1,1]),e=e||p,f=f||q,g=g||r;var $=p(1)!==e(1)||q(1)!==f(1)||r(1)!==g(1),_=o.selectAll("g.nv-wrap.nv-scatter").data([b]),aa=_.enter().append("g").attr("class","nvd3 nv-wrap nv-scatter nv-chart-"+n),ba=aa.append("defs"),ca=aa.append("g"),da=_.select("g");_.classed("nv-single-point",N),ca.append("g").attr("class","nv-groups"),ca.append("g").attr("class","nv-point-paths"),aa.append("g").attr("class","nv-point-clips"),_.attr("transform","translate("+j.left+","+j.top+")"),ba.append("clipPath").attr("id","nv-edge-clip-"+n).append("rect"),_.select("#nv-edge-clip-"+n+" rect").attr("width",Q).attr("height",W>0?W:0),da.attr("clip-path",D?"url(#nv-edge-clip-"+n+")":""),T=!0;var ea=_.select(".nv-groups").selectAll(".nv-group").data(function(a){return a},function(a){return a.key});ea.enter().append("g").style("stroke-opacity",1e-6).style("fill-opacity",1e-6),ea.exit().remove(),ea.attr("class",function(a,b){return(a.classed||"")+" nv-group nv-series-"+b}).classed("nv-noninteractive",!z).classed("hover",function(a){return a.hover}),ea.watchTransition(U,"scatter: groups").style("fill",function(a,b){return m(a,b)}).style("stroke",function(a,b){return m(a,b)}).style("stroke-opacity",1).style("fill-opacity",.5);var fa=ea.selectAll("path.nv-point").data(function(a){return a.values.map(function(a,b){return[a,b]}).filter(function(a,b){return A(a[0],b)})});if(fa.enter().append("path").attr("class",function(a){return"nv-point nv-point-"+a[1]}).style("fill",function(a){return a.color}).style("stroke",function(a){return a.color}).attr("transform",function(b){return"translate("+a.utils.NaNtoZero(e(s(b[0],b[1])))+","+a.utils.NaNtoZero(f(t(b[0],b[1])))+")"}).attr("d",a.utils.symbol().type(function(a){return v(a[0])}).size(function(a){return r(u(a[0],a[1]))})),fa.exit().remove(),ea.exit().selectAll("path.nv-point").watchTransition(U,"scatter exit").attr("transform",function(b){return"translate("+a.utils.NaNtoZero(p(s(b[0],b[1])))+","+a.utils.NaNtoZero(q(t(b[0],b[1])))+")"}).remove(),fa.filter(function(a){return $||c(a,"x","y")}).watchTransition(U,"scatter points").attr("transform",function(b){return"translate("+a.utils.NaNtoZero(p(s(b[0],b[1])))+","+a.utils.NaNtoZero(q(t(b[0],b[1])))+")"}),fa.filter(function(a){return $||c(a,"shape","size")}).watchTransition(U,"scatter points").attr("d",a.utils.symbol().type(function(a){return v(a[0])}).size(function(a){return r(u(a[0],a[1]))})),S){var ga=ea.selectAll(".nv-label").data(function(a){return a.values.map(function(a,b){return[a,b]}).filter(function(a,b){return A(a[0],b)})});ga.enter().append("text").style("fill",function(a,b){return a.color}).style("stroke-opacity",0).style("fill-opacity",1).attr("transform",function(b){var c=a.utils.NaNtoZero(e(s(b[0],b[1])))+Math.sqrt(r(u(b[0],b[1]))/Math.PI)+2;return"translate("+c+","+a.utils.NaNtoZero(f(t(b[0],b[1])))+")"}).text(function(a,b){return a[0].label}),ga.exit().remove(),ea.exit().selectAll("path.nv-label").watchTransition(U,"scatter exit").attr("transform",function(b){var c=a.utils.NaNtoZero(p(s(b[0],b[1])))+Math.sqrt(r(u(b[0],b[1]))/Math.PI)+2;return"translate("+c+","+a.utils.NaNtoZero(q(t(b[0],b[1])))+")"}).remove(),ga.each(function(a){d3.select(this).classed("nv-label",!0).classed("nv-label-"+a[1],!1).classed("hover",!1)}),ga.watchTransition(U,"scatter labels").attr("transform",function(b){var c=a.utils.NaNtoZero(p(s(b[0],b[1])))+Math.sqrt(r(u(b[0],b[1]))/Math.PI)+2;return"translate("+c+","+a.utils.NaNtoZero(q(t(b[0],b[1])))+")"})}R?(clearTimeout(h),h=setTimeout(i,R)):i(),e=p.copy(),f=q.copy(),g=r.copy()}),U.renderEnd("scatter immediate"),d}var e,f,g,h,i,j={top:0,right:0,bottom:0,left:0},k=null,l=null,m=a.utils.defaultColor(),n=Math.floor(1e5*Math.random()),o=null,p=d3.scale.linear(),q=d3.scale.linear(),r=d3.scale.linear(),s=function(a){return a.x},t=function(a){return a.y},u=function(a){return a.size||1},v=function(a){return a.shape||"circle"},w=[],x=[],y=[],z=!0,A=function(a){return!a.notActive},B=!1,C=.1,D=!1,E=!0,F=!1,G=function(){return 25},H=null,I=null,J=null,K=null,L=null,M=null,N=!1,O=d3.dispatch("elementClick","elementDblClick","elementMouseover","elementMouseout","renderEnd"),P=!0,Q=250,R=300,S=!1,T=!1,U=a.utils.renderWatch(O,Q),V=[16,256];return d.dispatch=O,d.options=a.utils.optionsFunc.bind(d),d._calls=new function(){this.clearHighlights=function(){return a.dom.write(function(){o.selectAll(".nv-point.hover").classed("hover",!1)}),null},this.highlightPoint=function(b,c,d){a.dom.write(function(){o.select(".nv-groups").selectAll(".nv-series-"+b).selectAll(".nv-point-"+c).classed("hover",d)})}},O.on("elementMouseover.point",function(a){z&&d._calls.highlightPoint(a.seriesIndex,a.pointIndex,!0)}),O.on("elementMouseout.point",function(a){z&&d._calls.highlightPoint(a.seriesIndex,a.pointIndex,!1)}),d._options=Object.create({},{width:{get:function(){return k},set:function(a){k=a}},height:{get:function(){return l},set:function(a){l=a}},xScale:{get:function(){return p},set:function(a){p=a}},yScale:{get:function(){return q},set:function(a){q=a}},pointScale:{get:function(){return r},set:function(a){r=a}},xDomain:{get:function(){return H},set:function(a){H=a}},yDomain:{get:function(){return I},set:function(a){I=a}},pointDomain:{get:function(){return L},set:function(a){L=a}},xRange:{get:function(){return J},set:function(a){J=a}},yRange:{get:function(){return K},set:function(a){K=a}},pointRange:{get:function(){return M},set:function(a){M=a}},forceX:{get:function(){return w},set:function(a){w=a}},forceY:{get:function(){return x},set:function(a){x=a}},forcePoint:{get:function(){return y},set:function(a){y=a}},interactive:{get:function(){return z},set:function(a){z=a}},pointActive:{get:function(){return A},set:function(a){A=a}},padDataOuter:{get:function(){return C},set:function(a){C=a}},padData:{get:function(){return B},set:function(a){B=a}},clipEdge:{get:function(){return D},set:function(a){D=a}},clipVoronoi:{get:function(){return E},set:function(a){E=a}},clipRadius:{get:function(){return G},set:function(a){G=a}},showVoronoi:{get:function(){return F},set:function(a){F=a}},id:{get:function(){return n},set:function(a){n=a}},interactiveUpdateDelay:{get:function(){return R},set:function(a){R=a}},showLabels:{get:function(){return S},set:function(a){S=a}},x:{get:function(){return s},set:function(a){s=d3.functor(a)}},y:{get:function(){return t},set:function(a){t=d3.functor(a)}},pointSize:{get:function(){return u},set:function(a){u=d3.functor(a)}},pointShape:{get:function(){return v},set:function(a){v=d3.functor(a)}},margin:{get:function(){return j},set:function(a){j.top=void 0!==a.top?a.top:j.top,j.right=void 0!==a.right?a.right:j.right,j.bottom=void 0!==a.bottom?a.bottom:j.bottom,j.left=void 0!==a.left?a.left:j.left}},duration:{get:function(){return Q},set:function(a){Q=a,U.reset(Q)}},color:{get:function(){return m},set:function(b){m=a.utils.getColor(b)}},useVoronoi:{get:function(){return P},set:function(a){P=a,P===!1&&(E=!1)}}}),a.utils.initOptions(d),d},a.models.scatterChart=function(){"use strict";function b(z){return E.reset(),E.models(c),t&&E.models(d),u&&E.models(e),q&&E.models(g),r&&E.models(h),z.each(function(z){m=d3.select(this),a.utils.initSVG(m);var H=a.utils.availableWidth(k,m,j),I=a.utils.availableHeight(l,m,j);if(b.update=function(){0===A?m.call(b):m.transition().duration(A).call(b)},b.container=this,w.setter(G(z),b.update).getter(F(z)).update(),w.disabled=z.map(function(a){return!!a.disabled}),!x){var J;x={};for(J in w)w[J]instanceof Array?x[J]=w[J].slice(0):x[J]=w[J]}if(!(z&&z.length&&z.filter(function(a){return a.values.length}).length))return a.utils.noData(b,m),E.renderEnd("scatter immediate"),b;m.selectAll(".nv-noData").remove(),o=c.xScale(),p=c.yScale();var K=m.selectAll("g.nv-wrap.nv-scatterChart").data([z]),L=K.enter().append("g").attr("class","nvd3 nv-wrap nv-scatterChart nv-chart-"+c.id()),M=L.append("g"),N=K.select("g");if(M.append("rect").attr("class","nvd3 nv-background").style("pointer-events","none"),M.append("g").attr("class","nv-x nv-axis"),M.append("g").attr("class","nv-y nv-axis"),M.append("g").attr("class","nv-scatterWrap"),M.append("g").attr("class","nv-regressionLinesWrap"),M.append("g").attr("class","nv-distWrap"),M.append("g").attr("class","nv-legendWrap"),v&&N.select(".nv-y.nv-axis").attr("transform","translate("+H+",0)"),s){var O=H;f.width(O),K.select(".nv-legendWrap").datum(z).call(f),f.height()>j.top&&(j.top=f.height(),I=a.utils.availableHeight(l,m,j)),K.select(".nv-legendWrap").attr("transform","translate(0,"+-j.top+")")}else N.select(".nv-legendWrap").selectAll("*").remove();K.attr("transform","translate("+j.left+","+j.top+")"),c.width(H).height(I).color(z.map(function(a,b){return a.color=a.color||n(a,b),a.color}).filter(function(a,b){return!z[b].disabled})).showLabels(B),K.select(".nv-scatterWrap").datum(z.filter(function(a){return!a.disabled})).call(c),K.select(".nv-regressionLinesWrap").attr("clip-path","url(#nv-edge-clip-"+c.id()+")");var P=K.select(".nv-regressionLinesWrap").selectAll(".nv-regLines").data(function(a){return a});P.enter().append("g").attr("class","nv-regLines");var Q=P.selectAll(".nv-regLine").data(function(a){return[a]});Q.enter().append("line").attr("class","nv-regLine").style("stroke-opacity",0),Q.filter(function(a){return a.intercept&&a.slope}).watchTransition(E,"scatterPlusLineChart: regline").attr("x1",o.range()[0]).attr("x2",o.range()[1]).attr("y1",function(a,b){return p(o.domain()[0]*a.slope+a.intercept)}).attr("y2",function(a,b){return p(o.domain()[1]*a.slope+a.intercept)}).style("stroke",function(a,b,c){return n(a,c)}).style("stroke-opacity",function(a,b){return a.disabled||"undefined"==typeof a.slope||"undefined"==typeof a.intercept?0:1}),t&&(d.scale(o)._ticks(a.utils.calcTicksX(H/100,z)).tickSize(-I,0),N.select(".nv-x.nv-axis").attr("transform","translate(0,"+p.range()[0]+")").call(d)),u&&(e.scale(p)._ticks(a.utils.calcTicksY(I/36,z)).tickSize(-H,0),N.select(".nv-y.nv-axis").call(e)),q&&(g.getData(c.x()).scale(o).width(H).color(z.map(function(a,b){return a.color||n(a,b)}).filter(function(a,b){return!z[b].disabled})),M.select(".nv-distWrap").append("g").attr("class","nv-distributionX"),N.select(".nv-distributionX").attr("transform","translate(0,"+p.range()[0]+")").datum(z.filter(function(a){return!a.disabled})).call(g)),r&&(h.getData(c.y()).scale(p).width(I).color(z.map(function(a,b){return a.color||n(a,b)}).filter(function(a,b){return!z[b].disabled})),M.select(".nv-distWrap").append("g").attr("class","nv-distributionY"),N.select(".nv-distributionY").attr("transform","translate("+(v?H:-h.size())+",0)").datum(z.filter(function(a){return!a.disabled})).call(h)),f.dispatch.on("stateChange",function(a){for(var c in a)w[c]=a[c];y.stateChange(w),b.update()}),y.on("changeState",function(a){"undefined"!=typeof a.disabled&&(z.forEach(function(b,c){b.disabled=a.disabled[c]}),w.disabled=a.disabled),b.update()}),c.dispatch.on("elementMouseout.tooltip",function(a){i.hidden(!0),m.select(".nv-chart-"+c.id()+" .nv-series-"+a.seriesIndex+" .nv-distx-"+a.pointIndex).attr("y1",0),m.select(".nv-chart-"+c.id()+" .nv-series-"+a.seriesIndex+" .nv-disty-"+a.pointIndex).attr("x2",h.size())}),c.dispatch.on("elementMouseover.tooltip",function(a){m.select(".nv-series-"+a.seriesIndex+" .nv-distx-"+a.pointIndex).attr("y1",a.relativePos[1]-I),m.select(".nv-series-"+a.seriesIndex+" .nv-disty-"+a.pointIndex).attr("x2",a.relativePos[0]+g.size()),i.data(a).hidden(!1)}),C=o.copy(),D=p.copy()}),E.renderEnd("scatter with line immediate"),b}var c=a.models.scatter(),d=a.models.axis(),e=a.models.axis(),f=a.models.legend(),g=a.models.distribution(),h=a.models.distribution(),i=a.models.tooltip(),j={top:30,right:20,bottom:50,left:75},k=null,l=null,m=null,n=a.utils.defaultColor(),o=c.xScale(),p=c.yScale(),q=!1,r=!1,s=!0,t=!0,u=!0,v=!1,w=a.utils.state(),x=null,y=d3.dispatch("stateChange","changeState","renderEnd"),z=null,A=250,B=!1;c.xScale(o).yScale(p),d.orient("bottom").tickPadding(10),e.orient(v?"right":"left").tickPadding(10),g.axis("x"),h.axis("y"),i.headerFormatter(function(a,b){return d.tickFormat()(a,b)}).valueFormatter(function(a,b){return e.tickFormat()(a,b)});var C,D,E=a.utils.renderWatch(y,A),F=function(a){return function(){return{active:a.map(function(a){return!a.disabled})}}},G=function(a){return function(b){void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}};return b.dispatch=y,b.scatter=c,b.legend=f,b.xAxis=d,b.yAxis=e,b.distX=g,b.distY=h,b.tooltip=i,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return k},set:function(a){k=a}},height:{get:function(){return l},set:function(a){l=a}},container:{get:function(){return m},set:function(a){m=a}},showDistX:{get:function(){return q},set:function(a){q=a}},showDistY:{get:function(){return r},set:function(a){r=a}},showLegend:{get:function(){return s},set:function(a){s=a}},showXAxis:{get:function(){return t},set:function(a){t=a}},showYAxis:{get:function(){return u},set:function(a){u=a}},defaultState:{get:function(){return x},set:function(a){x=a}},noData:{get:function(){return z},set:function(a){z=a}},duration:{get:function(){return A},set:function(a){A=a}},showLabels:{get:function(){return B},set:function(a){B=a}},margin:{get:function(){return j},set:function(a){j.top=void 0!==a.top?a.top:j.top,j.right=void 0!==a.right?a.right:j.right,j.bottom=void 0!==a.bottom?a.bottom:j.bottom,j.left=void 0!==a.left?a.left:j.left}},rightAlignYAxis:{get:function(){return v},set:function(a){v=a,e.orient(a?"right":"left")}},color:{get:function(){return n},set:function(b){n=a.utils.getColor(b),f.color(n),g.color(n),h.color(n)}}}),a.utils.inheritOptions(b,c),a.utils.initOptions(b),b},a.models.sparkline=function(){"use strict";function b(k){return t.reset(),k.each(function(b){var k=h-g.left-g.right,s=i-g.top-g.bottom;j=d3.select(this),a.utils.initSVG(j),l.domain(c||d3.extent(b,n)).range(e||[0,k]),m.domain(d||d3.extent(b,o)).range(f||[s,0]);var t=j.selectAll("g.nv-wrap.nv-sparkline").data([b]),u=t.enter().append("g").attr("class","nvd3 nv-wrap nv-sparkline");u.append("g"),t.select("g");t.attr("transform","translate("+g.left+","+g.top+")");var v=t.selectAll("path").data(function(a){return[a]});v.enter().append("path"),v.exit().remove(),v.style("stroke",function(a,b){return a.color||p(a,b)}).attr("d",d3.svg.line().x(function(a,b){return l(n(a,b))}).y(function(a,b){return m(o(a,b))}));var w=t.selectAll("circle.nv-point").data(function(a){function b(b){if(-1!=b){var c=a[b];return c.pointIndex=b,c}return null}var c=a.map(function(a,b){return o(a,b)}),d=b(c.lastIndexOf(m.domain()[1])),e=b(c.indexOf(m.domain()[0])),f=b(c.length-1);return[q?e:null,q?d:null,r?f:null].filter(function(a){return null!=a})});w.enter().append("circle"),w.exit().remove(),w.attr("cx",function(a,b){return l(n(a,a.pointIndex))}).attr("cy",function(a,b){return m(o(a,a.pointIndex))}).attr("r",2).attr("class",function(a,b){return n(a,a.pointIndex)==l.domain()[1]?"nv-point nv-currentValue":o(a,a.pointIndex)==m.domain()[0]?"nv-point nv-minValue":"nv-point nv-maxValue"})}),t.renderEnd("sparkline immediate"),b}var c,d,e,f,g={top:2,right:0,bottom:2,left:0},h=400,i=32,j=null,k=!0,l=d3.scale.linear(),m=d3.scale.linear(),n=function(a){return a.x},o=function(a){return a.y},p=a.utils.getColor(["#000"]),q=!0,r=!0,s=d3.dispatch("renderEnd"),t=a.utils.renderWatch(s);return b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return h},set:function(a){h=a}},height:{get:function(){return i},set:function(a){i=a}},xDomain:{get:function(){return c},set:function(a){c=a}},yDomain:{get:function(){return d},set:function(a){d=a}},xRange:{get:function(){return e},set:function(a){e=a}},yRange:{get:function(){return f},set:function(a){f=a}},xScale:{get:function(){return l},set:function(a){l=a}},yScale:{get:function(){return m},set:function(a){m=a}},animate:{get:function(){return k},set:function(a){k=a}},showMinMaxPoints:{get:function(){return q},set:function(a){q=a}},showCurrentPoint:{get:function(){return r},set:function(a){r=a}},x:{get:function(){return n},set:function(a){n=d3.functor(a)}},y:{get:function(){return o},set:function(a){o=d3.functor(a)}},margin:{get:function(){return g},set:function(a){g.top=void 0!==a.top?a.top:g.top,g.right=void 0!==a.right?a.right:g.right,g.bottom=void 0!==a.bottom?a.bottom:g.bottom,g.left=void 0!==a.left?a.left:g.left}},color:{get:function(){return p},set:function(b){p=a.utils.getColor(b)}}}),b.dispatch=s,a.utils.initOptions(b),b},a.models.sparklinePlus=function(){"use strict";function b(p){return r.reset(),r.models(e),p.each(function(p){function q(){if(!j){var a=z.selectAll(".nv-hoverValue").data(i),b=a.enter().append("g").attr("class","nv-hoverValue").style("stroke-opacity",0).style("fill-opacity",0);a.exit().transition().duration(250).style("stroke-opacity",0).style("fill-opacity",0).remove(),a.attr("transform",function(a){return"translate("+c(e.x()(p[a],a))+",0)"}).transition().duration(250).style("stroke-opacity",1).style("fill-opacity",1),i.length&&(b.append("line").attr("x1",0).attr("y1",-f.top).attr("x2",0).attr("y2",u),b.append("text").attr("class","nv-xValue").attr("x",-6).attr("y",-f.top).attr("text-anchor","end").attr("dy",".9em"),z.select(".nv-hoverValue .nv-xValue").text(k(e.x()(p[i[0]],i[0]))),b.append("text").attr("class","nv-yValue").attr("x",6).attr("y",-f.top).attr("text-anchor","start").attr("dy",".9em"),z.select(".nv-hoverValue .nv-yValue").text(l(e.y()(p[i[0]],i[0]))))}}function r(){function a(a,b){for(var c=Math.abs(e.x()(a[0],0)-b),d=0,f=0;f<a.length;f++)Math.abs(e.x()(a[f],f)-b)<c&&(c=Math.abs(e.x()(a[f],f)-b),d=f);return d}if(!j){var b=d3.mouse(this)[0]-f.left;i=[a(p,Math.round(c.invert(b)))],q()}}var s=d3.select(this);a.utils.initSVG(s);var t=a.utils.availableWidth(g,s,f),u=a.utils.availableHeight(h,s,f);if(b.update=function(){s.call(b)},b.container=this,!p||!p.length)return a.utils.noData(b,s),b;s.selectAll(".nv-noData").remove();var v=e.y()(p[p.length-1],p.length-1);c=e.xScale(),d=e.yScale();var w=s.selectAll("g.nv-wrap.nv-sparklineplus").data([p]),x=w.enter().append("g").attr("class","nvd3 nv-wrap nv-sparklineplus"),y=x.append("g"),z=w.select("g");y.append("g").attr("class","nv-sparklineWrap"),y.append("g").attr("class","nv-valueWrap"),y.append("g").attr("class","nv-hoverArea"),w.attr("transform","translate("+f.left+","+f.top+")");var A=z.select(".nv-sparklineWrap");if(e.width(t).height(u),A.call(e),m){var B=z.select(".nv-valueWrap"),C=B.selectAll(".nv-currentValue").data([v]);C.enter().append("text").attr("class","nv-currentValue").attr("dx",o?-8:8).attr("dy",".9em").style("text-anchor",o?"end":"start"),C.attr("x",t+(o?f.right:0)).attr("y",n?function(a){return d(a)}:0).style("fill",e.color()(p[p.length-1],p.length-1)).text(l(v))}y.select(".nv-hoverArea").append("rect").on("mousemove",r).on("click",function(){j=!j}).on("mouseout",function(){i=[],q()}),z.select(".nv-hoverArea rect").attr("transform",function(a){return"translate("+-f.left+","+-f.top+")"}).attr("width",t+f.left+f.right).attr("height",u+f.top)}),r.renderEnd("sparklinePlus immediate"),b}var c,d,e=a.models.sparkline(),f={top:15,right:100,bottom:10,left:50},g=null,h=null,i=[],j=!1,k=d3.format(",r"),l=d3.format(",.2f"),m=!0,n=!0,o=!1,p=null,q=d3.dispatch("renderEnd"),r=a.utils.renderWatch(q);return b.dispatch=q,b.sparkline=e,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return g},set:function(a){g=a}},height:{get:function(){return h},set:function(a){h=a}},xTickFormat:{get:function(){return k},set:function(a){k=a}},yTickFormat:{get:function(){return l},set:function(a){l=a}},showLastValue:{get:function(){return m},set:function(a){m=a}},alignValue:{get:function(){return n},set:function(a){n=a}},rightAlignValue:{get:function(){return o},set:function(a){o=a}},noData:{get:function(){return p},set:function(a){p=a}},margin:{get:function(){return f},set:function(a){f.top=void 0!==a.top?a.top:f.top,f.right=void 0!==a.right?a.right:f.right,f.bottom=void 0!==a.bottom?a.bottom:f.bottom,f.left=void 0!==a.left?a.left:f.left}}}),a.utils.inheritOptions(b,e),a.utils.initOptions(b),b},a.models.stackedArea=function(){"use strict";function b(n){return v.reset(),v.models(s),n.each(function(n){var t=f-e.left-e.right,w=g-e.top-e.bottom;j=d3.select(this),a.utils.initSVG(j),c=s.xScale(),d=s.yScale();var x=n;n.forEach(function(a,b){a.seriesIndex=b,a.values=a.values.map(function(a,c){return a.index=c,a.seriesIndex=b,a})});var y=n.filter(function(a){return!a.disabled});n=d3.layout.stack().order(p).offset(o).values(function(a){return a.values}).x(k).y(l).out(function(a,b,c){a.display={y:c,y0:b}})(y);var z=j.selectAll("g.nv-wrap.nv-stackedarea").data([n]),A=z.enter().append("g").attr("class","nvd3 nv-wrap nv-stackedarea"),B=A.append("defs"),C=A.append("g"),D=z.select("g");C.append("g").attr("class","nv-areaWrap"),C.append("g").attr("class","nv-scatterWrap"),z.attr("transform","translate("+e.left+","+e.top+")"),0==s.forceY().length&&s.forceY().push(0),s.width(t).height(w).x(k).y(function(a){return void 0!==a.display?a.display.y+a.display.y0:void 0}).color(n.map(function(a,b){return a.color=a.color||h(a,a.seriesIndex),a.color}));var E=D.select(".nv-scatterWrap").datum(n);E.call(s),B.append("clipPath").attr("id","nv-edge-clip-"+i).append("rect"),z.select("#nv-edge-clip-"+i+" rect").attr("width",t).attr("height",w),D.attr("clip-path",r?"url(#nv-edge-clip-"+i+")":"");var F=d3.svg.area().defined(m).x(function(a,b){return c(k(a,b))}).y0(function(a){return d(a.display.y0)}).y1(function(a){return d(a.display.y+a.display.y0)}).interpolate(q),G=d3.svg.area().defined(m).x(function(a,b){return c(k(a,b))}).y0(function(a){return d(a.display.y0)}).y1(function(a){return d(a.display.y0)}),H=D.select(".nv-areaWrap").selectAll("path.nv-area").data(function(a){return a});H.enter().append("path").attr("class",function(a,b){return"nv-area nv-area-"+b}).attr("d",function(a,b){return G(a.values,a.seriesIndex)}).on("mouseover",function(a,b){d3.select(this).classed("hover",!0),u.areaMouseover({point:a,series:a.key,pos:[d3.event.pageX,d3.event.pageY],seriesIndex:a.seriesIndex})}).on("mouseout",function(a,b){d3.select(this).classed("hover",!1),u.areaMouseout({point:a,series:a.key,pos:[d3.event.pageX,d3.event.pageY],seriesIndex:a.seriesIndex})}).on("click",function(a,b){d3.select(this).classed("hover",!1),u.areaClick({point:a,series:a.key,pos:[d3.event.pageX,d3.event.pageY],seriesIndex:a.seriesIndex})}),H.exit().remove(),H.style("fill",function(a,b){return a.color||h(a,a.seriesIndex)}).style("stroke",function(a,b){return a.color||h(a,a.seriesIndex)}),H.watchTransition(v,"stackedArea path").attr("d",function(a,b){return F(a.values,b)}),s.dispatch.on("elementMouseover.area",function(a){D.select(".nv-chart-"+i+" .nv-area-"+a.seriesIndex).classed("hover",!0)}),s.dispatch.on("elementMouseout.area",function(a){D.select(".nv-chart-"+i+" .nv-area-"+a.seriesIndex).classed("hover",!1)}),b.d3_stackedOffset_stackPercent=function(a){var b,c,d,e=a.length,f=a[0].length,g=[];for(c=0;f>c;++c){for(b=0,d=0;b<x.length;b++)d+=l(x[b].values[c]);if(d)for(b=0;e>b;b++)a[b][c][1]/=d;else for(b=0;e>b;b++)a[b][c][1]=0}for(c=0;f>c;++c)g[c]=0;return g}}),v.renderEnd("stackedArea immediate"),b}var c,d,e={top:0,right:0,bottom:0,left:0},f=960,g=500,h=a.utils.defaultColor(),i=Math.floor(1e5*Math.random()),j=null,k=function(a){return a.x},l=function(a){return a.y},m=function(a,b){return!isNaN(l(a,b))&&null!==l(a,b)},n="stack",o="zero",p="default",q="linear",r=!1,s=a.models.scatter(),t=250,u=d3.dispatch("areaClick","areaMouseover","areaMouseout","renderEnd","elementClick","elementMouseover","elementMouseout");s.pointSize(2.2).pointDomain([2.2,2.2]);var v=a.utils.renderWatch(u,t);return b.dispatch=u,b.scatter=s,s.dispatch.on("elementClick",function(){u.elementClick.apply(this,arguments)}),s.dispatch.on("elementMouseover",function(){u.elementMouseover.apply(this,arguments)}),s.dispatch.on("elementMouseout",function(){u.elementMouseout.apply(this,arguments)}),b.interpolate=function(a){return arguments.length?(q=a,b):q},b.duration=function(a){return arguments.length?(t=a,v.reset(t),s.duration(t),b):t},b.dispatch=u,b.scatter=s,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return f},set:function(a){f=a}},height:{get:function(){return g},set:function(a){g=a}},defined:{get:function(){return m},set:function(a){m=a}},clipEdge:{get:function(){return r},set:function(a){r=a}},offset:{get:function(){return o},set:function(a){o=a}},order:{get:function(){return p},set:function(a){p=a}},interpolate:{get:function(){return q},set:function(a){q=a}},x:{get:function(){return k},set:function(a){k=d3.functor(a)}},y:{get:function(){return l},set:function(a){l=d3.functor(a)}},margin:{get:function(){return e},set:function(a){e.top=void 0!==a.top?a.top:e.top,e.right=void 0!==a.right?a.right:e.right,e.bottom=void 0!==a.bottom?a.bottom:e.bottom,e.left=void 0!==a.left?a.left:e.left}},color:{get:function(){return h},set:function(b){h=a.utils.getColor(b)}},style:{get:function(){return n},set:function(a){switch(n=a){case"stack":b.offset("zero"),b.order("default");break;case"stream":b.offset("wiggle"),b.order("inside-out");break;case"stream-center":b.offset("silhouette"),b.order("inside-out");break;case"expand":b.offset("expand"),b.order("default");break;case"stack_percent":b.offset(b.d3_stackedOffset_stackPercent),b.order("default")}}},duration:{get:function(){return t},set:function(a){t=a,v.reset(t),s.duration(t)}}}),a.utils.inheritOptions(b,s),a.utils.initOptions(b),b},a.models.stackedAreaChart=function(){"use strict";function b(k){return J.reset(),J.models(e),s&&J.models(f),t&&J.models(g),k.each(function(k){function B(){s&&V.select(".nv-focus .nv-x.nv-axis").attr("transform","translate(0,"+R+")").transition().duration(G).call(f)}function J(){if(t){if("expand"===e.style()||"stack_percent"===e.style()){var a=g.tickFormat();H&&a===N||(H=a),g.tickFormat(N)}else H&&(g.tickFormat(H),H=null);V.select(".nv-focus .nv-y.nv-axis").transition().duration(0).call(g)}}function O(a){var b=V.select(".nv-focus .nv-stackedWrap").datum(k.filter(function(a){return!a.disabled}).map(function(b,c){return{key:b.key,area:b.area,classed:b.classed,values:b.values.filter(function(b,c){return e.x()(b,c)>=a[0]&&e.x()(b,c)<=a[1]}),disableTooltip:b.disableTooltip}}));b.transition().duration(G).call(e),B(),J()}var P=d3.select(this);a.utils.initSVG(P);var Q=a.utils.availableWidth(n,P,m),R=a.utils.availableHeight(o,P,m)-(v?l.height():0);if(b.update=function(){P.transition().duration(G).call(b)},b.container=this,z.setter(M(k),b.update).getter(L(k)).update(),z.disabled=k.map(function(a){return!!a.disabled}),!A){var S;A={};for(S in z)z[S]instanceof Array?A[S]=z[S].slice(0):A[S]=z[S]}if(!(k&&k.length&&k.filter(function(a){return a.values.length}).length))return a.utils.noData(b,P),b;P.selectAll(".nv-noData").remove(),c=e.xScale(),d=e.yScale();var T=P.selectAll("g.nv-wrap.nv-stackedAreaChart").data([k]),U=T.enter().append("g").attr("class","nvd3 nv-wrap nv-stackedAreaChart").append("g"),V=T.select("g");U.append("g").attr("class","nv-legendWrap"),U.append("g").attr("class","nv-controlsWrap");var W=U.append("g").attr("class","nv-focus");W.append("g").attr("class","nv-background").append("rect"),W.append("g").attr("class","nv-x nv-axis"),W.append("g").attr("class","nv-y nv-axis"),W.append("g").attr("class","nv-stackedWrap"),W.append("g").attr("class","nv-interactive");U.append("g").attr("class","nv-focusWrap");if(r){var X=q?Q-D:Q;h.width(X),V.select(".nv-legendWrap").datum(k).call(h),h.height()>m.top&&(m.top=h.height(),R=a.utils.availableHeight(o,P,m)-(v?l.height():0)),V.select(".nv-legendWrap").attr("transform","translate("+(Q-X)+","+-m.top+")")}else V.select(".nv-legendWrap").selectAll("*").remove();if(q){var Y=[{key:F.stacked||"Stacked",metaKey:"Stacked",disabled:"stack"!=e.style(),style:"stack"},{key:F.stream||"Stream",metaKey:"Stream",disabled:"stream"!=e.style(),style:"stream"},{key:F.expanded||"Expanded",metaKey:"Expanded",disabled:"expand"!=e.style(),style:"expand"},{key:F.stack_percent||"Stack %",metaKey:"Stack_Percent",disabled:"stack_percent"!=e.style(),style:"stack_percent"}];D=E.length/3*260,Y=Y.filter(function(a){return-1!==E.indexOf(a.metaKey)}),i.width(D).color(["#444","#444","#444"]),V.select(".nv-controlsWrap").datum(Y).call(i),Math.max(i.height(),h.height())>m.top&&(m.top=Math.max(i.height(),h.height()),R=a.utils.availableHeight(o,P,m)),V.select(".nv-controlsWrap").attr("transform","translate(0,"+-m.top+")")}else V.select(".nv-controlsWrap").selectAll("*").remove();T.attr("transform","translate("+m.left+","+m.top+")"),u&&V.select(".nv-y.nv-axis").attr("transform","translate("+Q+",0)"),w&&(j.width(Q).height(R).margin({left:m.left,top:m.top}).svgContainer(P).xScale(c),T.select(".nv-interactive").call(j)),V.select(".nv-focus .nv-background rect").attr("width",Q).attr("height",R),e.width(Q).height(R).color(k.map(function(a,b){return a.color||p(a,b)}).filter(function(a,b){return!k[b].disabled}));var Z=V.select(".nv-focus .nv-stackedWrap").datum(k.filter(function(a){return!a.disabled}));if(s&&f.scale(c)._ticks(a.utils.calcTicksX(Q/100,k)).tickSize(-R,0),t){var $;$="wiggle"===e.offset()?0:a.utils.calcTicksY(R/36,k),g.scale(d)._ticks($).tickSize(-Q,0)}if(v){l.width(Q),V.select(".nv-focusWrap").attr("transform","translate(0,"+(R+m.bottom+l.margin().top)+")").datum(k.filter(function(a){return!a.disabled})).call(l);var _=l.brush.empty()?l.xDomain():l.brush.extent();null!==_&&O(_)}else Z.transition().call(e),B(),J();e.dispatch.on("areaClick.toggle",function(a){1===k.filter(function(a){return!a.disabled}).length?k.forEach(function(a){a.disabled=!1}):k.forEach(function(b,c){b.disabled=c!=a.seriesIndex}),z.disabled=k.map(function(a){return!!a.disabled}),C.stateChange(z),b.update()}),h.dispatch.on("stateChange",function(a){for(var c in a)z[c]=a[c];C.stateChange(z),b.update()}),i.dispatch.on("legendClick",function(a,c){a.disabled&&(Y=Y.map(function(a){return a.disabled=!0,a}),a.disabled=!1,e.style(a.style),z.style=e.style(),C.stateChange(z),b.update())}),j.dispatch.on("elementMousemove",function(c){e.clearHighlights();var d,f,g,h=[],i=0,l=!0;if(k.filter(function(a,b){return a.seriesIndex=b,!a.disabled}).forEach(function(j,k){f=a.interactiveBisect(j.values,c.pointXValue,b.x());var m=j.values[f],n=b.y()(m,f);if(null!=n&&e.highlightPoint(k,f,!0),"undefined"!=typeof m){"undefined"==typeof d&&(d=m),"undefined"==typeof g&&(g=b.xScale()(b.x()(m,f)));var o="expand"==e.style()?m.display.y:b.y()(m,f);h.push({key:j.key,value:o,color:p(j,j.seriesIndex),point:m}),x&&"expand"!=e.style()&&null!=o&&(i+=o,l=!1)}}),h.reverse(),h.length>2){var m=b.yScale().invert(c.mouseY),n=null;h.forEach(function(a,b){m=Math.abs(m);var c=Math.abs(a.point.display.y0),d=Math.abs(a.point.display.y);return m>=c&&d+c>=m?void(n=b):void 0}),null!=n&&(h[n].highlight=!0)}x&&"expand"!=e.style()&&h.length>=2&&!l&&h.push({key:y,value:i,total:!0});var o=b.x()(d,f),q=j.tooltip.valueFormatter();"expand"===e.style()||"stack_percent"===e.style()?(I||(I=q),q=d3.format(".1%")):I&&(q=I,I=null),j.tooltip.valueFormatter(q).data({value:o,series:h})(),j.renderGuideLine(g)}),j.dispatch.on("elementMouseout",function(a){e.clearHighlights()}),l.dispatch.on("onBrush",function(a){O(a)}),C.on("changeState",function(a){"undefined"!=typeof a.disabled&&k.length===a.disabled.length&&(k.forEach(function(b,c){b.disabled=a.disabled[c]}),z.disabled=a.disabled),"undefined"!=typeof a.style&&(e.style(a.style),K=a.style),b.update()})}),J.renderEnd("stacked Area chart immediate"),b}var c,d,e=a.models.stackedArea(),f=a.models.axis(),g=a.models.axis(),h=a.models.legend(),i=a.models.legend(),j=a.interactiveGuideline(),k=a.models.tooltip(),l=a.models.focus(a.models.stackedArea()),m={top:30,right:25,bottom:50,left:60},n=null,o=null,p=a.utils.defaultColor(),q=!0,r=!0,s=!0,t=!0,u=!1,v=!1,w=!1,x=!0,y="TOTAL",z=a.utils.state(),A=null,B=null,C=d3.dispatch("stateChange","changeState","renderEnd"),D=250,E=["Stacked","Stream","Expanded"],F={},G=250;z.style=e.style(),f.orient("bottom").tickPadding(7),g.orient(u?"right":"left"),k.headerFormatter(function(a,b){return f.tickFormat()(a,b)}).valueFormatter(function(a,b){return g.tickFormat()(a,b)}),j.tooltip.headerFormatter(function(a,b){return f.tickFormat()(a,b)}).valueFormatter(function(a,b){return null==a?"N/A":g.tickFormat()(a,b)});var H=null,I=null;i.updateState(!1);var J=a.utils.renderWatch(C),K=e.style(),L=function(a){return function(){return{active:a.map(function(a){return!a.disabled}),style:e.style()}}},M=function(a){return function(b){void 0!==b.style&&(K=b.style),void 0!==b.active&&a.forEach(function(a,c){a.disabled=!b.active[c]})}},N=d3.format("%");return e.dispatch.on("elementMouseover.tooltip",function(a){a.point.x=e.x()(a.point),a.point.y=e.y()(a.point),k.data(a).hidden(!1)}),e.dispatch.on("elementMouseout.tooltip",function(a){k.hidden(!0)}),b.dispatch=C,b.stacked=e,b.legend=h,b.controls=i,b.xAxis=f,b.x2Axis=l.xAxis,b.yAxis=g,b.y2Axis=l.yAxis,b.interactiveLayer=j,b.tooltip=k,b.focus=l,b.dispatch=C,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{width:{get:function(){return n},set:function(a){n=a}},height:{get:function(){return o},set:function(a){o=a}},showLegend:{get:function(){return r},set:function(a){r=a}},showXAxis:{get:function(){return s},set:function(a){s=a}},showYAxis:{get:function(){return t},set:function(a){t=a}},defaultState:{get:function(){return A},set:function(a){A=a}},noData:{get:function(){return B},set:function(a){B=a}},showControls:{get:function(){return q},set:function(a){q=a}},controlLabels:{get:function(){return F},set:function(a){F=a}},controlOptions:{get:function(){return E},set:function(a){E=a}},showTotalInTooltip:{get:function(){return x},set:function(a){x=a}},totalLabel:{get:function(){return y},set:function(a){y=a}},focusEnable:{get:function(){return v},set:function(a){v=a}},focusHeight:{get:function(){return l.height()},set:function(a){l.height(a)}},brushExtent:{get:function(){return l.brushExtent()},set:function(a){l.brushExtent(a)}},margin:{get:function(){return m},set:function(a){m.top=void 0!==a.top?a.top:m.top,m.right=void 0!==a.right?a.right:m.right,m.bottom=void 0!==a.bottom?a.bottom:m.bottom,m.left=void 0!==a.left?a.left:m.left}},focusMargin:{get:function(){return l.margin},set:function(a){l.margin.top=void 0!==a.top?a.top:l.margin.top,l.margin.right=void 0!==a.right?a.right:l.margin.right,l.margin.bottom=void 0!==a.bottom?a.bottom:l.margin.bottom,l.margin.left=void 0!==a.left?a.left:l.margin.left}},duration:{get:function(){return G},set:function(a){G=a,J.reset(G),e.duration(G),f.duration(G),g.duration(G)}},color:{get:function(){return p},set:function(b){p=a.utils.getColor(b),h.color(p),e.color(p),l.color(p)}},x:{get:function(){return e.x()},set:function(a){e.x(a),l.x(a)}},y:{get:function(){return e.y()},set:function(a){e.y(a),l.y(a)}},rightAlignYAxis:{get:function(){return u},set:function(a){u=a,g.orient(u?"right":"left")}},useInteractiveGuideline:{get:function(){return w},set:function(a){w=!!a,b.interactive(!a),b.useVoronoi(!a),e.scatter.interactive(!a)}}}),a.utils.inheritOptions(b,e),a.utils.initOptions(b),b},a.models.stackedAreaWithFocusChart=function(){return a.models.stackedAreaChart().margin({bottom:30}).focusEnable(!0)},a.models.sunburst=function(){"use strict";function b(a){var b=c(a);return b>90?180:0}function c(a){var b=Math.max(0,Math.min(2*Math.PI,F(a.x))),c=Math.max(0,Math.min(2*Math.PI,F(a.x+a.dx))),d=(b+c)/2*(180/Math.PI)-90;return d}function d(a){var b=Math.max(0,Math.min(2*Math.PI,F(a.x))),c=Math.max(0,Math.min(2*Math.PI,F(a.x+a.dx)));return(c-b)/(2*Math.PI)}function e(a){var b=Math.max(0,Math.min(2*Math.PI,F(a.x))),c=Math.max(0,Math.min(2*Math.PI,F(a.x+a.dx))),d=c-b;return d>z}function f(a,b){var c=d3.interpolate(F.domain(),[l.x,l.x+l.dx]),d=d3.interpolate(G.domain(),[l.y,1]),e=d3.interpolate(G.range(),[l.y?20:0,o]);return 0===b?function(){return J(a)}:function(b){return F.domain(c(b)),G.domain(d(b)).range(e(b)),J(a)}}function g(a){var b=d3.interpolate({x:a.x0,dx:a.dx0,y:a.y0,dy:a.dy0},a);return function(c){var d=b(c);return a.x0=d.x,a.dx0=d.dx,a.y0=d.y,a.dy0=d.dy,J(d)}}function h(a){var b=B(a);I[b]||(I[b]={});var c=I[b];c.dx=a.dx,c.x=a.x,c.dy=a.dy,c.y=a.y}function i(a){a.forEach(function(a){var b=B(a),c=I[b];c?(a.dx0=c.dx,a.x0=c.x,a.dy0=c.dy,a.y0=c.y):(a.dx0=a.dx,a.x0=a.x,a.dy0=a.dy,a.y0=a.y),h(a)})}function j(a){var d=v.selectAll("text"),g=v.selectAll("path");d.transition().attr("opacity",0),l=a,g.transition().duration(D).attrTween("d",f).each("end",function(d){if(d.x>=a.x&&d.x<a.x+a.dx&&d.depth>=a.depth){var f=d3.select(this.parentNode),g=f.select("text");g.transition().duration(D).text(function(a){return y(a)}).attr("opacity",function(a){return e(a)?1:0}).attr("transform",function(){var e=this.getBBox().width;if(0===d.depth)return"translate("+e/2*-1+",0)";if(d.depth===a.depth)return"translate("+(G(d.y)+5)+",0)";var f=c(d),g=b(d);return 0===g?"rotate("+f+")translate("+(G(d.y)+5)+",0)":"rotate("+f+")translate("+(G(d.y)+e+5)+",0)rotate("+g+")"})}})}function k(f){return K.reset(),f.each(function(f){v=d3.select(this),m=a.utils.availableWidth(q,v,p),n=a.utils.availableHeight(r,v,p),o=Math.min(m,n)/2,G.range([0,o]);var h=v.select("g.nvd3.nv-wrap.nv-sunburst");h[0][0]?h.attr("transform","translate("+(m/2+p.left+p.right)+","+(n/2+p.top+p.bottom)+")"):h=v.append("g").attr("class","nvd3 nv-wrap nv-sunburst nv-chart-"+u).attr("transform","translate("+(m/2+p.left+p.right)+","+(n/2+p.top+p.bottom)+")"),v.on("click",function(a,b){E.chartClick({data:a,index:b,pos:d3.event,id:u})}),H.value(t[s]||t.count);var k=H.nodes(f[0]).reverse();i(k);var l=h.selectAll(".arc-container").data(k,B),z=l.enter().append("g").attr("class","arc-container");z.append("path").attr("d",J).style("fill",function(a){return a.color?a.color:w(C?(a.children?a:a.parent).name:a.name)}).style("stroke","#FFF").on("click",j).on("mouseover",function(a,b){d3.select(this).classed("hover",!0).style("opacity",.8),E.elementMouseover({data:a,color:d3.select(this).style("fill"),percent:d(a)})}).on("mouseout",function(a,b){d3.select(this).classed("hover",!1).style("opacity",1),E.elementMouseout({data:a})}).on("mousemove",function(a,b){E.elementMousemove({data:a})}),l.each(function(a){d3.select(this).select("path").transition().duration(D).attrTween("d",g)}),x&&(l.selectAll("text").remove(),l.append("text").text(function(a){return y(a)}).transition().duration(D).attr("opacity",function(a){return e(a)?1:0}).attr("transform",function(a){var d=this.getBBox().width;if(0===a.depth)return"rotate(0)translate("+d/2*-1+",0)";var e=c(a),f=b(a);return 0===f?"rotate("+e+")translate("+(G(a.y)+5)+",0)":"rotate("+e+")translate("+(G(a.y)+d+5)+",0)rotate("+f+")"})),j(k[k.length-1]),l.exit().transition().duration(D).attr("opacity",0).each("end",function(a){var b=B(a);I[b]=void 0}).remove()}),K.renderEnd("sunburst immediate"),k}var l,m,n,o,p={top:0,right:0,bottom:0,left:0},q=600,r=600,s="count",t={count:function(a){return 1},value:function(a){return a.value||a.size},size:function(a){return a.value||a.size}},u=Math.floor(1e4*Math.random()),v=null,w=a.utils.defaultColor(),x=!1,y=function(a){return"count"===s?a.name+" #"+a.value:a.name+" "+(a.value||a.size)},z=.02,A=function(a,b){return a.name>b.name},B=function(a,b){return a.name},C=!0,D=500,E=d3.dispatch("chartClick","elementClick","elementDblClick","elementMousemove","elementMouseover","elementMouseout","renderEnd"),F=d3.scale.linear().range([0,2*Math.PI]),G=d3.scale.sqrt(),H=d3.layout.partition().sort(A),I={},J=d3.svg.arc().startAngle(function(a){return Math.max(0,Math.min(2*Math.PI,F(a.x)))}).endAngle(function(a){return Math.max(0,Math.min(2*Math.PI,F(a.x+a.dx)))}).innerRadius(function(a){return Math.max(0,G(a.y))}).outerRadius(function(a){return Math.max(0,G(a.y+a.dy))}),K=a.utils.renderWatch(E);return k.dispatch=E,k.options=a.utils.optionsFunc.bind(k),k._options=Object.create({},{width:{get:function(){return q},set:function(a){q=a}},height:{get:function(){return r},set:function(a){r=a}},mode:{get:function(){return s},set:function(a){s=a}},id:{get:function(){return u},set:function(a){u=a}},duration:{get:function(){return D},set:function(a){D=a}},groupColorByParent:{get:function(){return C},set:function(a){C=!!a}},showLabels:{get:function(){return x},set:function(a){x=!!a}},labelFormat:{get:function(){return y},set:function(a){y=a}},labelThreshold:{get:function(){return z},set:function(a){z=a}},sort:{get:function(){return A},set:function(a){A=a}},key:{get:function(){return B},set:function(a){B=a}},margin:{get:function(){return p},set:function(a){p.top=void 0!=a.top?a.top:p.top,p.right=void 0!=a.right?a.right:p.right,p.bottom=void 0!=a.bottom?a.bottom:p.bottom,p.left=void 0!=a.left?a.left:p.left}},color:{get:function(){return w},set:function(b){w=a.utils.getColor(b)}}}),a.utils.initOptions(k),k},a.models.sunburstChart=function(){"use strict";function b(d){return n.reset(),n.models(c),d.each(function(d){var h=d3.select(this);a.utils.initSVG(h);var i=a.utils.availableWidth(f,h,e),j=a.utils.availableHeight(g,h,e);return b.update=function(){0===l?h.call(b):h.transition().duration(l).call(b)},b.container=h,d&&d.length?(h.selectAll(".nv-noData").remove(),c.width(i).height(j).margin(e),void h.call(c)):(a.utils.noData(b,h),b)}),n.renderEnd("sunburstChart immediate"),b}var c=a.models.sunburst(),d=a.models.tooltip(),e={top:30,right:20,bottom:20,left:20},f=null,g=null,h=a.utils.defaultColor(),i=!1,j=(Math.round(1e5*Math.random()),null),k=null,l=250,m=d3.dispatch("stateChange","changeState","renderEnd"),n=a.utils.renderWatch(m);return d.duration(0).headerEnabled(!1).valueFormatter(function(a){return a}),c.dispatch.on("elementMouseover.tooltip",function(a){a.series={key:a.data.name,value:a.data.value||a.data.size,color:a.color,percent:a.percent},i||(delete a.percent,delete a.series.percent),d.data(a).hidden(!1)}),c.dispatch.on("elementMouseout.tooltip",function(a){d.hidden(!0)}),c.dispatch.on("elementMousemove.tooltip",function(a){d()}),b.dispatch=m,b.sunburst=c,b.tooltip=d,b.options=a.utils.optionsFunc.bind(b),b._options=Object.create({},{noData:{get:function(){return k},set:function(a){k=a}},defaultState:{get:function(){return j},set:function(a){j=a}},showTooltipPercent:{get:function(){return i},set:function(a){i=a}},color:{get:function(){return h},set:function(a){h=a,c.color(h)}},duration:{get:function(){return l},set:function(a){l=a,n.reset(l),c.duration(l)}},margin:{get:function(){return e},set:function(a){e.top=void 0!==a.top?a.top:e.top,e.right=void 0!==a.right?a.right:e.right,e.bottom=void 0!==a.bottom?a.bottom:e.bottom,e.left=void 0!==a.left?a.left:e.left,c.margin(e)}}}),a.utils.inheritOptions(b,c),a.utils.initOptions(b),b},a.version="1.8.4-dev"}();
\ No newline at end of file
diff --git a/contrib/eigen/bench/perf_monitoring/gemm/run.sh b/contrib/eigen/bench/perf_monitoring/run.sh
similarity index 51%
rename from contrib/eigen/bench/perf_monitoring/gemm/run.sh
rename to contrib/eigen/bench/perf_monitoring/run.sh
index 9d6ee40bca2b7315ceed7624980f183423d9ae76..7e2ea1264d52d3cac44bf1fc0901c54d4783f0e1 100755
--- a/contrib/eigen/bench/perf_monitoring/gemm/run.sh
+++ b/contrib/eigen/bench/perf_monitoring/run.sh
@@ -1,35 +1,58 @@
 #!/bin/bash
 
-# ./run.sh gemm
-# ./run.sh lazy_gemm
+# ./run.sh gemm gemm_settings.txt
+# ./run.sh lazy_gemm lazy_gemm_settings.txt
+# ./run.sh gemv gemv_settings.txt
+# ./run.sh trmv_up gemv_square_settings.txt
+# ...
 
 # Examples of environment variables to be set:
 #   PREFIX="haswell-fma-"
 #   CXX_FLAGS="-mfma"
+#   CXX=clang++
 
 # Options:
 #   -up : enforce the recomputation of existing data, and keep best results as a merging strategy
 #   -s  : recompute selected changesets only and keep bests
+#   -np : no plotting of results, just generate the data
 
 bench=$1
+settings_file=$2
 
-if echo "$*" | grep '\-up' > /dev/null; then
+if [[ "$*" =~ '-up' ]]; then
   update=true
 else
   update=false
 fi
 
-if echo "$*" | grep '\-s' > /dev/null; then
+if [[ "$*" =~ '-s' ]]; then
   selected=true
 else
   selected=false
 fi
 
+if [[ "$*" =~ '-np' ]]; then
+  do_plot=false
+else
+  do_plot=true
+fi
+
+
+WORKING_DIR=${PREFIX:?"default"}
+
+if [ -z "$PREFIX" ]; then
+  WORKING_DIR_PREFIX="$WORKING_DIR/"
+else
+  WORKING_DIR_PREFIX="$WORKING_DIR/$PREFIX-"
+fi
+echo "WORKING_DIR_PREFIX=$WORKING_DIR_PREFIX"
+mkdir -p $WORKING_DIR
+
 global_args="$*"
 
-if [ $selected == true ]; then
+if $selected ; then
  echo "Recompute selected changesets only and keep bests"
-elif [ $update == true ]; then
+elif $update ; then
  echo "(Re-)Compute all changesets and keep bests"
 else
  echo "Skip previously computed changesets"
@@ -38,14 +61,14 @@ fi
 
 
 if [ ! -d "eigen_src" ]; then
-  hg clone https://bitbucket.org/eigen/eigen eigen_src
+  git clone https://gitlab.com/libeigen/eigen.git eigen_src
 else
   cd eigen_src
-  hg pull -u
+  git pull
   cd ..
 fi
 
-if [ ! -z '$CXX' ]; then
+if [ -z "$CXX" ]; then
   CXX=g++
 fi
 
@@ -84,11 +107,11 @@ function test_current
   
   prev=""
   if [ -e "$name.backup" ]; then
-    prev=`grep $rev "$name.backup" | cut -c 14-`
+    prev=`grep $rev "$name.backup" | cut -d ' ' -f 2-`
   fi
   res=$prev
   count_rev=`echo $prev |  wc -w`
-  count_ref=`cat $bench"_settings.txt" |  wc -l`
+  count_ref=`cat $settings_file |  wc -l`
   if echo "$global_args" | grep "$rev" > /dev/null; then
     rev_found=true
   else
@@ -96,9 +119,10 @@ function test_current
   fi
 #  echo $update et $selected et $rev_found because $rev et "$global_args"
 #  echo $count_rev et $count_ref
-  if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] &&  [ $rev_found == true ]); then
-    if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src $bench.cpp -DSCALAR=$scalar -o $name; then
-      curr=`./$name`
+  if $update || [ $count_rev != $count_ref ] || ( $selected &&  $rev_found ); then
+    echo "RUN: $CXX -O3 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src $bench.cpp -DSCALAR=$scalar -o $name"
+    if $CXX -O3 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src $bench.cpp -DSCALAR=$scalar -o $name; then
+      curr=`./$name $settings_file`
       if [ $count_rev == $count_ref ]; then
         echo "merge previous $prev"
         echo "with new       $curr"
@@ -117,40 +141,43 @@ function test_current
   fi
 }
 
-make_backup $PREFIX"s"$bench
-make_backup $PREFIX"d"$bench
-make_backup $PREFIX"c"$bench
+make_backup $WORKING_DIR_PREFIX"s"$bench
+make_backup $WORKING_DIR_PREFIX"d"$bench
+make_backup $WORKING_DIR_PREFIX"c"$bench
 
 cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev
 do
   if [ ! -z '$rev' ]; then
-    echo "Testing rev $rev"
+    rev2=`echo $rev | cut -f 2 -d':'`
+    echo "Testing rev $rev, $rev2"
     cd eigen_src
-    hg up -C $rev > /dev/null
-    actual_rev=`hg identify | cut -f1 -d' '`
+    git checkout $rev2 > /dev/null
+    actual_rev=`git rev-parse --short HEAD`
     cd ..
     
-    test_current $actual_rev float                  $PREFIX"s"$bench
-    test_current $actual_rev double                 $PREFIX"d"$bench
-    test_current $actual_rev "std::complex<double>" $PREFIX"c"$bench
+    test_current $actual_rev float                  $WORKING_DIR_PREFIX"s"$bench
+    test_current $actual_rev double                 $WORKING_DIR_PREFIX"d"$bench
+    test_current $actual_rev "std::complex<double>" $WORKING_DIR_PREFIX"c"$bench
   fi
   
 done
 
 echo "Float:"
-cat $PREFIX"s""$bench.out"
+cat $WORKING_DIR_PREFIX"s""$bench.out"
 echo " "
 
 echo "Double:"
-cat $PREFIX"d""$bench.out"
+cat $WORKING_DIR_PREFIX"d""$bench.out"
 echo ""
 
 echo "Complex:"
-cat $PREFIX"c""$bench.out"
+cat $WORKING_DIR_PREFIX"c""$bench.out"
 echo ""
 
-./make_plot.sh $PREFIX"s"$bench $bench
-./make_plot.sh $PREFIX"d"$bench $bench
-./make_plot.sh $PREFIX"c"$bench $bench
+if $do_plot ; then
 
+./make_plot.sh $WORKING_DIR_PREFIX"s"$bench $bench $settings_file
+./make_plot.sh $WORKING_DIR_PREFIX"d"$bench $bench $settings_file
+./make_plot.sh $WORKING_DIR_PREFIX"c"$bench $bench $settings_file
 
+fi
diff --git a/contrib/eigen/bench/perf_monitoring/runall.sh b/contrib/eigen/bench/perf_monitoring/runall.sh
new file mode 100755
index 0000000000000000000000000000000000000000..cdbe48eb836269ecd0a3996b0abef4f747b6a7ec
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/runall.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# ./runall.sh "Title"
+
+# Examples of environment variables to be set:
+#   PREFIX="haswell-fma-"
+#   CXX_FLAGS="-mfma"
+#   CXX=clang++
+
+# Options:
+#   -up : enforce the recomputation of existing data, and keep best results as a merging strategy
+#   -s  : recompute selected changesets only and keep bests
+#   -np : no plotting of results, just generate the data
+
+if [[ "$*" =~ '-np' ]]; then
+  do_plot=false
+else
+  do_plot=true
+fi
+
+./run.sh gemm gemm_settings.txt $*
+./run.sh lazy_gemm lazy_gemm_settings.txt $*
+./run.sh gemv gemv_settings.txt $*
+./run.sh gemvt gemv_settings.txt $*
+./run.sh trmv_up gemv_square_settings.txt $*
+./run.sh trmv_lo gemv_square_settings.txt $*
+./run.sh trmv_upt gemv_square_settings.txt $*
+./run.sh trmv_lot gemv_square_settings.txt $*
+./run.sh llt gemm_square_settings.txt $*
+
+if $do_plot ; then
+
+# generate html file
+
+function print_td {
+  echo '<td><a href="'$PREFIX'-'$1"$2"'.html"><img src="'$PREFIX'-'$1"$2"'.png" title="'$3'"></a></td>' >> $htmlfile
+}
+
+function print_tr {
+  echo '<tr><th colspan="3">'"$2"'</th></tr>' >> $htmlfile
+  echo '<tr>' >> $htmlfile
+  print_td s $1 float
+  print_td d $1 double
+  print_td c $1 complex
+  echo '</tr>' >> $htmlfile
+}
+
+if [ -n "$PREFIX" ]; then
+
+
+cp resources/s1.js $PREFIX/
+cp resources/s2.js $PREFIX/
+
+htmlfile="$PREFIX/index.html"
+cat resources/header.html > $htmlfile
+
+echo '<h1>'$1'</h1>' >> $htmlfile
+echo '<table>' >> $htmlfile
+print_tr gemm       'C += A &middot; B   &nbsp; (gemm)'
+print_tr lazy_gemm  'C += A &middot; B   &nbsp; (gemm lazy)'
+print_tr gemv       'y += A &middot; x   &nbsp; (gemv)'
+print_tr gemvt      'y += A<sup>T</sup> &middot; x  &nbsp; (gemv)'
+print_tr trmv_up    'y += U &middot; x   &nbsp; (trmv)'
+print_tr trmv_upt   'y += U<sup>T</sup> &middot; x  &nbsp; (trmv)'
+print_tr trmv_lo    'y += L &middot; x   &nbsp; (trmv)'
+print_tr trmv_lot   'y += L<sup>T</sup> &middot; x  &nbsp; (trmv)'
+print_tr trmv_lot   'L &middot; L<sup>T<sup> = A &nbsp;  (Cholesky,potrf)'
+
+cat resources/footer.html >> $htmlfile
+
+fi
+fi
diff --git a/contrib/eigen/bench/perf_monitoring/trmv_lo.cpp b/contrib/eigen/bench/perf_monitoring/trmv_lo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fabb6e5444e0ddd0b2e7a6e94f5a810aaa9bf79
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/trmv_lo.cpp
@@ -0,0 +1,12 @@
+#include "gemv_common.h"
+
+EIGEN_DONT_INLINE
+void trmv(const Mat &A, const Vec &B, Vec &C)
+{
+  C.noalias() += A.triangularView<Lower>() * B;
+}
+
+int main(int argc, char **argv)
+{
+  return main_gemv(argc, argv, trmv);
+}
diff --git a/contrib/eigen/bench/perf_monitoring/trmv_lot.cpp b/contrib/eigen/bench/perf_monitoring/trmv_lot.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..32e085aaf0eb94f4d872dc5a7b3b0fd896607ad4
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/trmv_lot.cpp
@@ -0,0 +1,12 @@
+#include "gemv_common.h"
+
+EIGEN_DONT_INLINE
+void trmv(const Mat &A, Vec &B, const Vec &C)
+{
+  B.noalias() += A.transpose().triangularView<Lower>() * C;
+}
+
+int main(int argc, char **argv)
+{
+  return main_gemv(argc, argv, trmv);
+}
diff --git a/contrib/eigen/bench/perf_monitoring/trmv_up.cpp b/contrib/eigen/bench/perf_monitoring/trmv_up.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c58e471ec36abad1257acaabd4c7c6b0f37163f1
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/trmv_up.cpp
@@ -0,0 +1,12 @@
+#include "gemv_common.h"
+
+EIGEN_DONT_INLINE
+void trmv(const Mat &A, const Vec &B, Vec &C)
+{
+  C.noalias() += A.triangularView<Upper>() * B;
+}
+
+int main(int argc, char **argv)
+{
+  return main_gemv(argc, argv, trmv);
+}
diff --git a/contrib/eigen/bench/perf_monitoring/trmv_upt.cpp b/contrib/eigen/bench/perf_monitoring/trmv_upt.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..511e00885b817c75cd90652b208a23bd8bafce49
--- /dev/null
+++ b/contrib/eigen/bench/perf_monitoring/trmv_upt.cpp
@@ -0,0 +1,12 @@
+#include "gemv_common.h"
+
+EIGEN_DONT_INLINE
+void trmv(const Mat &A, Vec &B, const Vec &C)
+{
+  B.noalias() += A.transpose().triangularView<Upper>() * C;
+}
+
+int main(int argc, char **argv)
+{
+  return main_gemv(argc, argv, trmv);
+}
diff --git a/contrib/eigen/bench/sparse_randomsetter.cpp b/contrib/eigen/bench/sparse_randomsetter.cpp
index 19a76e38da65834ab5f9fbb9e7ab13c4783a2cdb..c433742120cdff785aa8197972a4bbbdcbed60ee 100644
--- a/contrib/eigen/bench/sparse_randomsetter.cpp
+++ b/contrib/eigen/bench/sparse_randomsetter.cpp
@@ -1,6 +1,7 @@
 
 #define NOGMM
 #define NOMTL
+#define EIGEN_GOOGLEHASH_SUPPORT 1
 
 #include <map>
 #include <ext/hash_map>
diff --git a/contrib/eigen/bench/spbench/CMakeLists.txt b/contrib/eigen/bench/spbench/CMakeLists.txt
index 029ba6d6be57b404e697fe9c399a1d5401553ae1..b1860049c0cb5607d4f2e221379ab211266accdf 100644
--- a/contrib/eigen/bench/spbench/CMakeLists.txt
+++ b/contrib/eigen/bench/spbench/CMakeLists.txt
@@ -11,9 +11,9 @@ set(SPARSE_LIBS "")
 # if(PARDISO_LIBRARIES)
 #   add_definitions("-DEIGEN_PARDISO_SUPPORT")
 #   set(SPARSE_LIBS ${SPARSE_LIBS} ${PARDISO_LIBRARIES})
-# endif(PARDISO_LIBRARIES)
+# endif()
 
-find_package(Cholmod)
+find_package(CHOLMOD)
 if(CHOLMOD_FOUND AND BLAS_FOUND AND LAPACK_FOUND)
   add_definitions("-DEIGEN_CHOLMOD_SUPPORT")
   include_directories(${CHOLMOD_INCLUDES})
@@ -21,7 +21,7 @@ if(CHOLMOD_FOUND AND BLAS_FOUND AND LAPACK_FOUND)
   set(CHOLMOD_ALL_LIBS  ${CHOLMOD_LIBRARIES} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
 endif()
 
-find_package(Umfpack)
+find_package(UMFPACK)
 if(UMFPACK_FOUND AND BLAS_FOUND)
   add_definitions("-DEIGEN_UMFPACK_SUPPORT")
   include_directories(${UMFPACK_INCLUDES})
@@ -29,8 +29,15 @@ if(UMFPACK_FOUND AND BLAS_FOUND)
   set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endif()
 
+find_package(KLU)
+if(KLU_FOUND)
+  add_definitions("-DEIGEN_KLU_SUPPORT")
+  include_directories(${KLU_INCLUDES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES})
+endif()
+
 find_package(SuperLU 4.0)
-if(SUPERLU_FOUND AND BLAS_FOUND)
+if(SuperLU_FOUND AND BLAS_FOUND)
   add_definitions("-DEIGEN_SUPERLU_SUPPORT")
   include_directories(${SUPERLU_INCLUDES})
   set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${BLAS_LIBRARIES})
@@ -57,7 +64,7 @@ if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS AND BLAS_FOUND)
   elseif(METIS_FOUND)
     include_directories(${METIS_INCLUDE_DIRS})
     set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${METIS_LIBRARIES})  
-  endif(SCOTCH_FOUND)
+  endif()
   set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES})
   set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP})
 endif()
@@ -66,12 +73,12 @@ if(METIS_FOUND)
   include_directories(${METIS_INCLUDE_DIRS})
   set (SPARSE_LIBS ${SPARSE_LIBS} ${METIS_LIBRARIES})
   add_definitions("-DEIGEN_METIS_SUPPORT")
-endif(METIS_FOUND)
+endif()
 
 find_library(RT_LIBRARY rt)
 if(RT_LIBRARY)
   set(SPARSE_LIBS ${SPARSE_LIBS} ${RT_LIBRARY})
-endif(RT_LIBRARY)
+endif()
 
 add_executable(spbenchsolver spbenchsolver.cpp)
 target_link_libraries (spbenchsolver ${SPARSE_LIBS})
diff --git a/contrib/eigen/bench/spbench/sp_solver.cpp b/contrib/eigen/bench/spbench/sp_solver.cpp
index a1f4bac8afdb4aa03bc8f6201a93fc86f982d4e7..75821a620c65f04ca68fa4bbe3799ab89b65221a 100644
--- a/contrib/eigen/bench/spbench/sp_solver.cpp
+++ b/contrib/eigen/bench/spbench/sp_solver.cpp
@@ -122,4 +122,4 @@ int main(int argc, char **args)
 //  std::cout<<x.transpose()<<"\n";
   
   return 0;
-}
\ No newline at end of file
+}
diff --git a/contrib/eigen/bench/spbench/spbenchsolver.cpp b/contrib/eigen/bench/spbench/spbenchsolver.cpp
index 4acd0039cee4c0af59e5c794b79d9443458ec283..2a7351124e85959afa4af5ac6e49a72004095cf5 100644
--- a/contrib/eigen/bench/spbench/spbenchsolver.cpp
+++ b/contrib/eigen/bench/spbench/spbenchsolver.cpp
@@ -54,7 +54,7 @@ int main(int argc, char ** args)
       statbuf.close();
     }
     else
-      std::cerr << "Unable to open the provided file for writting... \n";
+      std::cerr << "Unable to open the provided file for writing... \n";
   }       
   
   // Get the maximum number of iterations and the tolerance
diff --git a/contrib/eigen/bench/spbench/spbenchsolver.h b/contrib/eigen/bench/spbench/spbenchsolver.h
index 19c719c049a3f84d748446489bcc89b0792df42b..8f59d10711f63a3229508ea86ed17aef5e2c187c 100644
--- a/contrib/eigen/bench/spbench/spbenchsolver.h
+++ b/contrib/eigen/bench/spbench/spbenchsolver.h
@@ -37,6 +37,10 @@
 #include <Eigen/UmfPackSupport>
 #endif
 
+#ifdef EIGEN_KLU_SUPPORT
+#include <Eigen/KLUSupport>
+#endif
+
 #ifdef EIGEN_PARDISO_SUPPORT
 #include <Eigen/PardisoSupport>
 #endif
@@ -51,6 +55,7 @@
 
 // CONSTANTS
 #define EIGEN_UMFPACK  10
+#define EIGEN_KLU  11
 #define EIGEN_SUPERLU  20
 #define EIGEN_PASTIX  30
 #define EIGEN_PARDISO  40
@@ -109,6 +114,12 @@ void printStatheader(std::ofstream& out)
   out << "   <PACKAGE> UMFPACK </PACKAGE> \n"; 
   out << "  </SOLVER> \n"; 
 #endif
+#ifdef EIGEN_KLU_SUPPORT
+  out <<"  <SOLVER ID='" << EIGEN_KLU << "'>\n";
+  out << "   <TYPE> LU </TYPE> \n";
+  out << "   <PACKAGE> KLU </PACKAGE> \n";
+  out << "  </SOLVER> \n";
+#endif
 #ifdef EIGEN_SUPERLU_SUPPORT
   out <<"  <SOLVER ID='" << EIGEN_SUPERLU << "'>\n"; 
   out << "   <TYPE> LU </TYPE> \n";
@@ -315,6 +326,14 @@ void SelectSolvers(const SparseMatrix<Scalar>&A, unsigned int sym, Matrix<Scalar
     UmfPackLU<SpMat> solver; 
     call_directsolver(solver, EIGEN_UMFPACK, A, b, refX,statFile); 
   }
+  #endif
+  //KLU
+  #ifdef EIGEN_KLU_SUPPORT
+  {
+    cout << "Solving with KLU LU ... \n";
+    KLU<SpMat> solver;
+    call_directsolver(solver, EIGEN_KLU, A, b, refX,statFile);
+  }
   #endif
     //SuperLU
   #ifdef EIGEN_SUPERLU_SUPPORT
diff --git a/contrib/eigen/bench/spbench/test_sparseLU.cpp b/contrib/eigen/bench/spbench/test_sparseLU.cpp
index f8ecbe69b9b53afd8dfd504c6a8972933e7d2131..a4cade641f9a15031fbeb606b8611d8a65bca1e7 100644
--- a/contrib/eigen/bench/spbench/test_sparseLU.cpp
+++ b/contrib/eigen/bench/spbench/test_sparseLU.cpp
@@ -90,4 +90,4 @@ int main(int argc, char **args)
   cout << "Number of nonzeros in the factor : " << solver.nnzL() + solver.nnzU() << std::endl; 
   
   return 0;
-}
\ No newline at end of file
+}
diff --git a/contrib/eigen/bench/tensors/README b/contrib/eigen/bench/tensors/README
index 3a5fdbe17e7a26ac0902ae5310278ede1ab73231..dcbf0217a88d3502d429c60fd177df974688a98c 100644
--- a/contrib/eigen/bench/tensors/README
+++ b/contrib/eigen/bench/tensors/README
@@ -11,11 +11,10 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU
 We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code.
 nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu
 
-last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
-g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+To compile and run the benchmark for SYCL, using ComputeCpp, simply run the
+following commands:
+1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
+2. bash eigen_sycl_bench.sh
 
-To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code):
-1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
-{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc
-2. The host compilation pass that generates the final host binary.
-clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl
+Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
+g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
diff --git a/contrib/eigen/bench/tensors/eigen_sycl_bench.sh b/contrib/eigen/bench/tensors/eigen_sycl_bench.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3f67b3d86bc84f5acb36172850e36d353d27b421
--- /dev/null
+++ b/contrib/eigen/bench/tensors/eigen_sycl_bench.sh
@@ -0,0 +1,30 @@
+rm -f tensor_benchmark_sycl
+: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
+echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
+${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++ \
+tensor_benchmarks_sycl.cc \
+benchmark_main.cc \
+-I ../../ \
+-I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ \
+-std=c++11 \
+-march=native \
+-O3 \
+-DNDEBUG \
+-DEIGEN_MPL2_ONLY \
+-DEIGEN_USE_SYCL=1 \
+-DEIGEN_SYCL_LOCAL_MEM=1 \
+-no-serial-memop \
+-mllvm \
+-inline-threshold=10000 \
+-fsycl-ih-last \
+-sycl-driver \
+-Xclang -cl-mad-enable \
+-lOpenCL \
+-lComputeCpp \
+-lpthread \
+-o \
+tensor_benchmark_sycl\
+${@:1}
+
+export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
+./tensor_benchmark_sycl
diff --git a/contrib/eigen/bench/tensors/eigen_sycl_bench_contract.sh b/contrib/eigen/bench/tensors/eigen_sycl_bench_contract.sh
new file mode 100644
index 0000000000000000000000000000000000000000..73fd6c4a07e94d70617c08e16a8ee581a5a242c0
--- /dev/null
+++ b/contrib/eigen/bench/tensors/eigen_sycl_bench_contract.sh
@@ -0,0 +1,7 @@
+rm -f tensor_contract_sycl_bench
+: "${COMPUTECPP_PACKAGE_ROOT_DIR:?Need to set COMPUTECPP_PACKAGE_ROOT_DIR}"
+echo "COMPUTECPP_PACKAGE_ROOT_DIR is set to: "$COMPUTECPP_PACKAGE_ROOT_DIR
+${COMPUTECPP_PACKAGE_ROOT_DIR}/bin/compute++  tensor_contract_sycl_bench.cc -I ../../ -I ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/ -std=c++11 -O3 -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -no-serial-memop -mllvm -inline-threshold=10000 -fsycl-ih-last -sycl-driver -Xclang -cl-mad-enable -lOpenCL -lComputeCpp -lpthread -o tensor_contract_sycl_bench ${@:1}
+export LD_LIBRARY_PATH=${COMPUTECPP_PACKAGE_ROOT_DIR}/lib:$LD_LIBRARY_PATH
+./tensor_contract_sycl_bench
+
diff --git a/contrib/eigen/bench/tensors/tensor_benchmarks.h b/contrib/eigen/bench/tensors/tensor_benchmarks.h
index c2fb3dedefb4977f4f7e5b6a4cdda30409ed2912..0825e156337bf7b5b4cd99250fd1db39f3200cad 100644
--- a/contrib/eigen/bench/tensors/tensor_benchmarks.h
+++ b/contrib/eigen/bench/tensors/tensor_benchmarks.h
@@ -27,6 +27,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     initialize();
   }
 
+  BenchmarkSuite(const Device& device, size_t m, size_t k)
+      : m_(1), k_(k), n_(m), device_(device) {
+    initialize();
+  }
+
   ~BenchmarkSuite() {
     device_.deallocate(a_);
     device_.deallocate(b_);
@@ -35,6 +40,11 @@ template <typename Device, typename T> class BenchmarkSuite {
 
   void memcpy(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
@@ -55,7 +65,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     }
     const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.template cast<T>();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.template cast<T>();
@@ -70,7 +84,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     sizes[0] = m_;
     sizes[1] = m_;
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = C.random();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = C.random();
@@ -93,7 +111,18 @@ template <typename Device, typename T> class BenchmarkSuite {
     const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
     const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
     const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.slice(first_quadrant, quarter_sizes).device(device_) =
+          A.slice(first_quadrant, quarter_sizes);
+      C.slice(second_quadrant, quarter_sizes).device(device_) =
+          B.slice(second_quadrant, quarter_sizes);
+      C.slice(third_quadrant, quarter_sizes).device(device_) =
+          A.slice(third_quadrant, quarter_sizes);
+      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+          B.slice(fourth_quadrant, quarter_sizes);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.slice(first_quadrant, quarter_sizes).device(device_) =
@@ -118,7 +147,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 1> output_size;
     output_size[0] = n_;
     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % k_, 0);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.chip(iter % k_, 0);
@@ -135,7 +168,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 1> output_size;
     output_size[0] = n_;
     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % n_, 1);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.chip(iter % n_, 1);
@@ -158,7 +195,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<int, 2> shuffle;
     shuffle[0] = 1;
     shuffle[1] = 0;
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.shuffle(shuffle);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.shuffle(shuffle);
@@ -186,7 +227,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
     paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.pad(paddings);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.pad(paddings);
@@ -216,6 +261,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
 #endif
 
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.stride(strides);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       B.device(device_) = A.stride(strides);
@@ -224,6 +274,7 @@ template <typename Device, typename T> class BenchmarkSuite {
     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
   }
 
+
   void broadcasting(int num_iters) {
     Eigen::array<TensorIndex, 2> size_a;
     size_a[0] = m_;
@@ -245,6 +296,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     broadcast.set(1, n_);
 #endif
 
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.broadcast(broadcast);
@@ -261,7 +317,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
@@ -280,6 +340,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+}
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
@@ -297,7 +362,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.exp() + B.log();
@@ -325,7 +394,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     // optimize the code.
     Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = B.sum(sum_along_dim);
+  }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.sum(sum_along_dim);
@@ -344,8 +417,8 @@ template <typename Device, typename T> class BenchmarkSuite {
         b_, input_size);
     Eigen::array<TensorIndex, 1> output_size;
     output_size[0] = k_;
-    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
-        c_, output_size);
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A(
+        a_, output_size);
 
 #ifndef EIGEN_HAS_INDEX_LIST
     Eigen::array<TensorIndex, 1> sum_along_dim;
@@ -355,10 +428,14 @@ template <typename Device, typename T> class BenchmarkSuite {
     // optimize the code.
     Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    A.device(device_) = B.sum(sum_along_dim);
+  }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
-      C.device(device_) = B.sum(sum_along_dim);
+      A.device(device_) = B.sum(sum_along_dim);
     }
     // Record the number of FLOP executed per second (assuming one operation
     // per value)
@@ -375,7 +452,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 0> output_size;
     TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
         c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.sum();
+    }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = B.sum();
@@ -385,33 +466,27 @@ template <typename Device, typename T> class BenchmarkSuite {
     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
   }
 
+  
+
   // do a contraction which is equivalent to a matrix multiplication
   void contraction(int num_iters) {
-    Eigen::array<TensorIndex, 2> sizeA;
-    sizeA[0] = m_;
-    sizeA[1] = k_;
-    Eigen::array<TensorIndex, 2> sizeB;
-    sizeB[0] = k_;
-    sizeB[1] = n_;
-    Eigen::array<TensorIndex, 2> sizeC;
-    sizeC[0] = m_;
-    sizeC[1] = n_;
+      contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false);
+  }
 
-    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
-    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
-    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+    void contractionRowMajor(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false);
+  }
+    
+  void contractionRowMajorAT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false);
+  }
 
-    typedef typename Tensor<T, 2>::DimensionPair DimPair;
-    Eigen::array<DimPair, 1> dims;
-    dims[0] = DimPair(1, 0);
+  void contractionRowMajorBT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true);
+  }
 
-    StartBenchmarkTiming();
-    for (int iter = 0; iter < num_iters; ++iter) {
-      C.device(device_) = A.contract(B, dims);
-    }
-    // Record the number of FLOP executed per second (size_ multiplications and
-    // additions for each value in the resulting tensor)
-    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
+  void contractionRowMajorABT(int num_iters) {
+      contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true);
   }
 
   void convolution(int num_iters, int kernel_x, int kernel_y) {
@@ -430,18 +505,58 @@ template <typename Device, typename T> class BenchmarkSuite {
     Eigen::array<TensorIndex, 2> dims;
     dims[0] = 0;
     dims[1] = 1;
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+     }
+#endif
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
       C.device(device_) = A.convolve(B, dims);
     }
-    // Record the number of FLOP executed per second (kernel_size
+    // Record the number of FLOPs executed per second (kernel_size
     // multiplications and additions for each value in the resulting tensor)
     finalizeBenchmark(static_cast<int64_t>(2) *
         (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
   }
 
  private:
+ // do a contraction which is equivalent to a matrix multiplication
+  template<int Layout>
+  void contraction(int num_iters, bool trans_a, bool trans_b) {
+    Eigen::array<TensorIndex, 2> sizeA;
+    sizeA[0] = (trans_a ? k_: m_);
+    sizeA[1] = (trans_a ? m_:  k_);
+    Eigen::array<TensorIndex, 2> sizeB;
+    sizeB[0] = (trans_b ? n_: k_);
+    sizeB[1] = (trans_b ? k_: n_);
+    Eigen::array<TensorIndex, 2> sizeC;
+    sizeC[0] = m_;
+    sizeC[1] = n_;
+
+    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA);
+    const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB);
+    TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC);
+
+    typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    TensorIndex a_contract_dim = (trans_a ? 0 : 1);
+    TensorIndex b_contract_dim = (trans_b ? 1 : 0);
+    dims[0] = DimPair(a_contract_dim, b_contract_dim);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+     }
+#endif
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+    }
+    // Record the number of FLOP executed per second (size_ multiplications and
+    // additions for each value in the resulting tensor)
+    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
+  }
+
   void initialize() {
     a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
     b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
@@ -453,7 +568,6 @@ template <typename Device, typename T> class BenchmarkSuite {
     device_.memset(b_, 23, k_ * n_ * sizeof(T));
     device_.memset(c_, 31, m_ * n_ * sizeof(T));
 
-    //BenchmarkUseRealTime();
   }
 
   inline void finalizeBenchmark(int64_t num_items) {
@@ -461,6 +575,11 @@ template <typename Device, typename T> class BenchmarkSuite {
     if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
       device_.synchronize();
     }
+#elif defined(EIGEN_USE_SYCL)
+    if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
+      device_.synchronize();
+    }
+
 #endif
     StopBenchmarkTiming();
     SetBenchmarkFlopsProcessed(num_items);
diff --git a/contrib/eigen/bench/tensors/tensor_benchmarks_sycl.cc b/contrib/eigen/bench/tensors/tensor_benchmarks_sycl.cc
index 7eca4d966071c242fc0b5c733ce75111fbba60d1..6f9f871791a6addb0db314e0cdc66eb751379dc7 100644
--- a/contrib/eigen/bench/tensors/tensor_benchmarks_sycl.cc
+++ b/contrib/eigen/bench/tensors/tensor_benchmarks_sycl.cc
@@ -1,37 +1,140 @@
-#define EIGEN_USE_SYCL
+#ifdef EIGEN_USE_SYCL
 
-#include <SYCL/sycl.hpp>
+#include <CL/sycl.hpp>
 #include <iostream>
 
 #include "tensor_benchmarks.h"
 
-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
-// Simple functions
-template <typename device_selector>
-cl::sycl::queue sycl_queue() {
-  return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) {
-    for (const auto& e : l) {
-      try {
-        std::rethrow_exception(e);
-      } catch (cl::sycl::exception e) {
-        std::cout << e.what() << std::endl;
-      }
-    }
-  });
-}
+cl::sycl::gpu_selector selector;
+Eigen::QueueInterface queue(selector);
+#define BM_FuncWithInput2DimsGPU(FUNC, D1, D2)                      \
+  static void BM_##FUNC##_##D1##x##D2(int iters, int N) {           \
+    StopBenchmarkTiming();                                          \
+    Eigen::SyclDevice device(&queue);                               \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2); \
+    suite.FUNC(iters);                                              \
+  }                                                                 \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2, 10, 10);
 
+BM_FuncWithInput2DimsGPU(rowReduction, 256, 100352);
+BM_FuncWithInput2DimsGPU(rowReduction, 64, 100352);
+BM_FuncWithInput2DimsGPU(rowReduction, 512, 25088);
+BM_FuncWithInput2DimsGPU(rowReduction, 128, 25088);
+BM_FuncWithInput2DimsGPU(rowReduction, 102, 6272);
+BM_FuncWithInput2DimsGPU(rowReduction, 256, 6272);
+BM_FuncWithInput2DimsGPU(rowReduction, 204, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 512, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 1024, 1568);
+BM_FuncWithInput2DimsGPU(rowReduction, 2048, 1568);
+
+BM_FuncWithInput2DimsGPU(colReduction, 100352, 256);
+BM_FuncWithInput2DimsGPU(colReduction, 100352, 64);
+BM_FuncWithInput2DimsGPU(colReduction, 25088, 512);
+BM_FuncWithInput2DimsGPU(colReduction, 6272, 102);
+BM_FuncWithInput2DimsGPU(colReduction, 25088, 128);
+BM_FuncWithInput2DimsGPU(colReduction, 6272, 256);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 204);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 512);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 1024);
+BM_FuncWithInput2DimsGPU(colReduction, 1568, 2048);
+BM_FuncWithInput2DimsGPU(fullReduction, 1001, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2050048, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2097152, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2048, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 262144, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 256, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 589824, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 1024, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 524288, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 512, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 2359296, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 1048576, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 131072, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 16384, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 9408, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 64, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 4096, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 36864, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 32768, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 128, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 147456, 1);
+BM_FuncWithInput2DimsGPU(fullReduction, 65536, 1);
 #define BM_FuncGPU(FUNC)                                       \
   static void BM_##FUNC(int iters, int N) {                    \
     StopBenchmarkTiming();                                     \
-    cl::sycl::queue q = sycl_queue<cl::sycl::gpu_selector>();  \
-    Eigen::SyclDevice device(q);                               \
+    Eigen::SyclDevice device(&queue);                          \
     BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
     suite.FUNC(iters);                                         \
   }                                                            \
   BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
 
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
+BM_FuncGPU(random);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
 BM_FuncGPU(broadcasting);
 BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                       \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {        \
+    StopBenchmarkTiming();                                              \
+    Eigen::SyclDevice device(&queue);                                   \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3); \
+    suite.FUNC(iters);                                                  \
+  }                                                                     \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajor, N, N, 64);
+
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorAT, N, N, 64);
+
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorBT, N, N, 64);
+
+
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, 64, N, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, 64, N);
+BM_FuncWithInputDimsGPU(contractionRowMajorABT, N, N, 64);
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {  \
+    StopBenchmarkTiming();                                     \
+    Eigen::SyclDevice device(&queue);                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
+    suite.FUNC(iters, DIM1, DIM2);                             \
+  }                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
+#endif
diff --git a/contrib/eigen/bench/tensors/tensor_contract_sycl_bench.cc b/contrib/eigen/bench/tensors/tensor_contract_sycl_bench.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f2defe4252f39812db4b37502ac753fcca57767
--- /dev/null
+++ b/contrib/eigen/bench/tensors/tensor_contract_sycl_bench.cc
@@ -0,0 +1,325 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_BENCH_CONTRACT_SYCL
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#include <SYCL/sycl.hpp>
+#include <fstream>
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+std::ofstream out("Result.txt");
+
+std::chrono::time_point<std::chrono::system_clock> get_time(){
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  return std::chrono::system_clock::now();
+}
+
+template<typename Start, typename End, typename TensorIndex>
+void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_ , TensorIndex num_iters, std::string name){
+
+  std::chrono::duration<double> elapsed_seconds = end-start;
+  std::cout <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
+  static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
+    out <<"Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : " <<
+    static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters)/ elapsed_seconds.count()) * 1e-9 << "\n";
+}
+
+// do a contraction which is equivalent to a matrix multiplication
+template<typename T, typename Device, typename TensorIndex>
+void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+ auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+
+// do a contraction which is equivalent to a matrix multiplication
+template<typename T, typename Device, typename TensorIndex>
+void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = k_;
+  sizeB[1] = n_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 0);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+
+}
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = m_;
+  sizeA[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+
+}
+
+template<typename T, typename Device, typename TensorIndex>
+void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
+  T* a_;
+  T* b_;
+  T* c_;
+  a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+  b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+  c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
+
+  // Initialize the content of the memory pools to prevent asan from
+  // complaining.
+  device_.memset(a_, 12, m_ * k_ * sizeof(T));
+  device_.memset(b_, 23, k_ * n_ * sizeof(T));
+  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+
+  Eigen::array<TensorIndex, 2> sizeA;
+  sizeA[0] = k_;
+  sizeA[1] = m_;
+  Eigen::array<TensorIndex, 2> sizeB;
+  sizeB[0] = n_;
+  sizeB[1] = k_;
+  Eigen::array<TensorIndex, 2> sizeC;
+  sizeC[0] = m_;
+  sizeC[1] = n_;
+
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
+  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
+  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);
+
+  typedef typename Tensor<T, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(0, 1);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+   }
+#endif
+  auto start = get_time();
+  for (int iter = 0; iter < num_iters; ++iter) {
+    C.device(device_) = A.contract(B, dims);
+  }
+  auto end = get_time();
+  // Record the number of FLOPs executed per second (size_ multiplications and
+  // additions for each value in the resulting tensor)
+  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
+  device_.deallocate(a_);
+  device_.deallocate(b_);
+  device_.deallocate(c_);
+  device_.synchronize();
+}
+
+int main() {
+  cl::sycl::gpu_selector selector;
+  Eigen::QueueInterface queue(selector);
+  Eigen::SyclDevice device(&queue);
+  int64_t num_iters =20;
+  for(int64_t m = 32; m <= 4096; m *= 2)
+    for(int64_t k = 32; k <= 4096; k *= 2)
+      for(int64_t n = 32; n <= 4096; n*= 2){
+        (contraction<float>(device, num_iters, m, k, n));
+        (contractionRowMajor<float>(device, num_iters, m, k, n));
+        (contractionAT<float>(device, num_iters, m, k, n));
+        (contractionBT<float>(device, num_iters, m, k, n));
+        (contractionABT<float>(device, num_iters, m, k, n));
+      }
+  return 0;
+  }
+
+#endif // EIGEN_BENCH_CONTRACT_SYCL
diff --git a/contrib/eigen/blas/CMakeLists.txt b/contrib/eigen/blas/CMakeLists.txt
index 9887d58043b611c5b717cb0fd871f36d4ab0065b..f3a94ec4a515d7ded1236e78afc4db1eac492f67 100644
--- a/contrib/eigen/blas/CMakeLists.txt
+++ b/contrib/eigen/blas/CMakeLists.txt
@@ -1,15 +1,13 @@
 
 project(EigenBlas CXX)
 
-include("../cmake/language_support.cmake")
-
-workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS)
-
-if(EIGEN_Fortran_COMPILER_WORKS)
-  enable_language(Fortran OPTIONAL)
-  if(NOT CMAKE_Fortran_COMPILER)
-    set(EIGEN_Fortran_COMPILER_WORKS OFF)
-  endif()
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
 endif()
 
 add_custom_target(blas)
@@ -28,20 +26,27 @@ else()
   set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c)
 endif()
 
+set(EIGEN_BLAS_TARGETS "")
+
 add_library(eigen_blas_static ${EigenBlas_SRCS})
-add_library(eigen_blas SHARED ${EigenBlas_SRCS})
+list(APPEND EIGEN_BLAS_TARGETS eigen_blas_static)
 
-if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
-  target_link_libraries(eigen_blas_static ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
-  target_link_libraries(eigen_blas        ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+if (EIGEN_BUILD_SHARED_LIBS)
+  add_library(eigen_blas SHARED ${EigenBlas_SRCS})
+  list(APPEND EIGEN_BLAS_TARGETS eigen_blas)
 endif()
 
-add_dependencies(blas eigen_blas eigen_blas_static)
+foreach(target IN LISTS EIGEN_BLAS_TARGETS)
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+      target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
 
-install(TARGETS eigen_blas eigen_blas_static
-        RUNTIME DESTINATION bin
-        LIBRARY DESTINATION lib
-        ARCHIVE DESTINATION lib)
+  add_dependencies(blas ${target})
+  install(TARGETS ${target}
+          RUNTIME DESTINATION bin
+          LIBRARY DESTINATION lib
+          ARCHIVE DESTINATION lib)
+endforeach()
 
 if(EIGEN_Fortran_COMPILER_WORKS)
 
diff --git a/contrib/eigen/blas/common.h b/contrib/eigen/blas/common.h
index 61d8344d9a94e17693d500bddd90054186e69663..a9b6978423fade7385c2bab0badcc6153b60bab3 100644
--- a/contrib/eigen/blas/common.h
+++ b/contrib/eigen/blas/common.h
@@ -10,6 +10,14 @@
 #ifndef EIGEN_BLAS_COMMON_H
 #define EIGEN_BLAS_COMMON_H
 
+#ifdef __GNUC__
+# if __GNUC__<5
+// GCC < 5.0 does not like the global Scalar typedef
+// we just keep shadow-warnings disabled permanently
+#  define EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+# endif
+#endif
+
 #include "../Eigen/Core"
 #include "../Eigen/Jacobi"
 
@@ -158,6 +166,10 @@ T* copy_back(T* x_cpy, T* x, int n, int incx)
   return x_cpy;
 }
 
-#define EIGEN_BLAS_FUNC(X) EIGEN_CAT(SCALAR_SUFFIX,X##_)
+#ifndef EIGEN_BLAS_FUNC_SUFFIX
+#define EIGEN_BLAS_FUNC_SUFFIX _
+#endif
+
+#define EIGEN_BLAS_FUNC(X) EIGEN_CAT(SCALAR_SUFFIX, EIGEN_CAT(X, EIGEN_BLAS_FUNC_SUFFIX))
 
 #endif // EIGEN_BLAS_COMMON_H
diff --git a/contrib/eigen/blas/double.cpp b/contrib/eigen/blas/double.cpp
index 295b1d1f2af8d6878415d8dc12467583a5057662..eb2e57307241a1bd9f5a8f99d419578fa953a3e7 100644
--- a/contrib/eigen/blas/double.cpp
+++ b/contrib/eigen/blas/double.cpp
@@ -19,7 +19,7 @@
 #include "level2_real_impl.h"
 #include "level3_impl.h"
 
-double BLASFUNC(dsdot)(int* n, float* x, int* incx, float* y, int* incy)
+double EIGEN_BLAS_FUNC(sdot)(int* n, float* x, int* incx, float* y, int* incy)
 {
   if(*n<=0) return 0;
 
diff --git a/contrib/eigen/blas/f2c/ctbmv.c b/contrib/eigen/blas/f2c/ctbmv.c
index 790fd581fedf8950814526becac229c2314a3d92..a6e0dae80985224d9ab08180184b38dc68c99fdc 100644
--- a/contrib/eigen/blas/f2c/ctbmv.c
+++ b/contrib/eigen/blas/f2c/ctbmv.c
@@ -147,7 +147,7 @@
 /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
 /*           Before entry, the incremented array X must contain the n */
 /*           element vector x. On exit, X is overwritten with the */
-/*           tranformed vector x. */
+/*           transformed vector x. */
 
 /*  INCX   - INTEGER. */
 /*           On entry, INCX specifies the increment for the elements of */
diff --git a/contrib/eigen/blas/f2c/dtbmv.c b/contrib/eigen/blas/f2c/dtbmv.c
index fdf73ebb52b13a8c35d8900db45260229566792e..aa67d19da67d608d0800d17562b1a9675260ad08 100644
--- a/contrib/eigen/blas/f2c/dtbmv.c
+++ b/contrib/eigen/blas/f2c/dtbmv.c
@@ -143,7 +143,7 @@
 /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
 /*           Before entry, the incremented array X must contain the n */
 /*           element vector x. On exit, X is overwritten with the */
-/*           tranformed vector x. */
+/*           transformed vector x. */
 
 /*  INCX   - INTEGER. */
 /*           On entry, INCX specifies the increment for the elements of */
diff --git a/contrib/eigen/blas/f2c/stbmv.c b/contrib/eigen/blas/f2c/stbmv.c
index fcf9ce336f5cc37d3751bc93d2e16af5097c0ec8..b5a68b545a7d0095398f189eb489b881ef927778 100644
--- a/contrib/eigen/blas/f2c/stbmv.c
+++ b/contrib/eigen/blas/f2c/stbmv.c
@@ -143,7 +143,7 @@
 /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
 /*           Before entry, the incremented array X must contain the n */
 /*           element vector x. On exit, X is overwritten with the */
-/*           tranformed vector x. */
+/*           transformed vector x. */
 
 /*  INCX   - INTEGER. */
 /*           On entry, INCX specifies the increment for the elements of */
diff --git a/contrib/eigen/blas/f2c/ztbmv.c b/contrib/eigen/blas/f2c/ztbmv.c
index 4cdcd7f8893aed466018cb03ed0cbc733fa6730f..3bf0beb019c4cc92633496a8d1de84ce3004e6b3 100644
--- a/contrib/eigen/blas/f2c/ztbmv.c
+++ b/contrib/eigen/blas/f2c/ztbmv.c
@@ -147,7 +147,7 @@
 /*           ( 1 + ( n - 1 )*abs( INCX ) ). */
 /*           Before entry, the incremented array X must contain the n */
 /*           element vector x. On exit, X is overwritten with the */
-/*           tranformed vector x. */
+/*           transformed vector x. */
 
 /*  INCX   - INTEGER. */
 /*           On entry, INCX specifies the increment for the elements of */
diff --git a/contrib/eigen/blas/level1_cplx_impl.h b/contrib/eigen/blas/level1_cplx_impl.h
index 719f5bac9135c8eace7ff3aae8771e6ae1dd8bd5..6c7edd7eb374b74a2fcfea8dd8d30b62addf96fa 100644
--- a/contrib/eigen/blas/level1_cplx_impl.h
+++ b/contrib/eigen/blas/level1_cplx_impl.h
@@ -25,7 +25,7 @@ namespace Eigen {
 
 // computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
 // res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
-RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n, RealScalar *px, int *incx)
+RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC(asum))(int *n, RealScalar *px, int *incx)
 {
 //   std::cerr << "__asum " << *n << " " << *incx << "\n";
   Complex* x = reinterpret_cast<Complex*>(px);
@@ -36,6 +36,28 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n,
   else          return make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
 }
 
+int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amax))(int *n, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  DenseIndex ret;
+  if(*incx==1)  make_vector(x,*n).unaryExpr<scalar_norm1_op>().maxCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().maxCoeff(&ret);
+  return int(ret)+1;
+}
+
+int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amin))(int *n, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  DenseIndex ret;
+  if(*incx==1)  make_vector(x,*n).unaryExpr<scalar_norm1_op>().minCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().minCoeff(&ret);
+  return int(ret)+1;
+}
+
 // computes a dot product of a conjugated vector with another vector.
 int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres)
 {
@@ -81,7 +103,7 @@ int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, in
   return 0;
 }
 
-RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n, RealScalar *px, int *incx)
+RealScalar EIGEN_CAT(REAL_SCALAR_SUFFIX, EIGEN_BLAS_FUNC(nrm2))(int *n, RealScalar *px, int *incx)
 {
 //   std::cerr << "__nrm2 " << *n << " " << *incx << "\n";
   if(*n<=0) return 0;
@@ -94,7 +116,7 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n,
   return make_vector(x,*n,*incx).stableNorm();
 }
 
-int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
+int EIGEN_BLAS_FUNC(EIGEN_CAT(REAL_SCALAR_SUFFIX, rot))(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
 {
   if(*n<=0) return 0;
 
@@ -117,7 +139,7 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScal
   return 0;
 }
 
-int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealScalar *palpha, RealScalar *px, int *incx)
+int EIGEN_BLAS_FUNC(EIGEN_CAT(REAL_SCALAR_SUFFIX, scal))(int *n, RealScalar *palpha, RealScalar *px, int *incx)
 {
   if(*n<=0) return 0;
 
diff --git a/contrib/eigen/blas/level1_impl.h b/contrib/eigen/blas/level1_impl.h
index f857bfa20cf66eb64fb3182963bc7964ea8c7a54..71bd534b74f62db39d84c45fdfebfebafec01be3 100644
--- a/contrib/eigen/blas/level1_impl.h
+++ b/contrib/eigen/blas/level1_impl.h
@@ -33,7 +33,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
 
-  // be carefull, *incx==0 is allowed !!
+  // be careful, *incx==0 is allowed !!
   if(*incx==1 && *incy==1)
     make_vector(y,*n) = make_vector(x,*n);
   else
@@ -51,28 +51,6 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int
   return 0;
 }
 
-int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *incx)
-{
-  if(*n<=0) return 0;
-  Scalar* x = reinterpret_cast<Scalar*>(px);
-
-  DenseIndex ret;
-  if(*incx==1)  make_vector(x,*n).cwiseAbs().maxCoeff(&ret);
-  else          make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
-  return int(ret)+1;
-}
-
-int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amin_)(int *n, RealScalar *px, int *incx)
-{
-  if(*n<=0) return 0;
-  Scalar* x = reinterpret_cast<Scalar*>(px);
-
-  DenseIndex ret;
-  if(*incx==1)  make_vector(x,*n).cwiseAbs().minCoeff(&ret);
-  else          make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
-  return int(ret)+1;
-}
-
 int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps)
 {
   using std::sqrt;
diff --git a/contrib/eigen/blas/level1_real_impl.h b/contrib/eigen/blas/level1_real_impl.h
index 02586d5195e5e853a891c50530ff67f11ccf24a6..c58771125ddb6da4a39a74273aa246f843b4edf9 100644
--- a/contrib/eigen/blas/level1_real_impl.h
+++ b/contrib/eigen/blas/level1_real_impl.h
@@ -23,6 +23,28 @@ RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx)
   else          return make_vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
 }
 
+int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amax))(int *n, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  DenseIndex ret;
+  if(*incx==1)  make_vector(x,*n).cwiseAbs().maxCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
+  return int(ret)+1;
+}
+
+int EIGEN_CAT(i, EIGEN_BLAS_FUNC(amin))(int *n, RealScalar *px, int *incx)
+{
+  if(*n<=0) return 0;
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  DenseIndex ret;
+  if(*incx==1)  make_vector(x,*n).cwiseAbs().minCoeff(&ret);
+  else          make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
+  return int(ret)+1;
+}
+
 // computes a vector-vector dot product.
 Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
 {
diff --git a/contrib/eigen/blas/single.cpp b/contrib/eigen/blas/single.cpp
index 20ea57d5c72494fc8f335ded5c8fde2a97113ce8..e66879aea1a7659699e7079933763050cab072e5 100644
--- a/contrib/eigen/blas/single.cpp
+++ b/contrib/eigen/blas/single.cpp
@@ -18,5 +18,5 @@
 #include "level2_real_impl.h"
 #include "level3_impl.h"
 
-float BLASFUNC(sdsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy)
+float EIGEN_BLAS_FUNC(dsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy)
 { return double(*alpha) + BLASFUNC(dsdot)(n, x, incx, y, incy); }
diff --git a/contrib/eigen/blas/testing/CMakeLists.txt b/contrib/eigen/blas/testing/CMakeLists.txt
index 3ab8026ea3e4dd051697be9303e3ad5a5576d72a..52c23acda1ca160da22e0ff31cfda6d745485b9a 100644
--- a/contrib/eigen/blas/testing/CMakeLists.txt
+++ b/contrib/eigen/blas/testing/CMakeLists.txt
@@ -17,7 +17,7 @@ macro(ei_add_blas_test testname)
   add_test(${testname} "${Eigen_SOURCE_DIR}/blas/testing/runblastest.sh" "${testname}" "${Eigen_SOURCE_DIR}/blas/testing/${testname}.dat")
   add_dependencies(buildtests ${targetname})
   
-endmacro(ei_add_blas_test)
+endmacro()
 
 ei_add_blas_test(sblat1)
 ei_add_blas_test(sblat2)
diff --git a/contrib/eigen/blas/testing/cblat1.f b/contrib/eigen/blas/testing/cblat1.f
index 8ca67fb19934250cce1a19616fbcd17a22d26707..73015f5a998a0293c38a89725f1a085e916bd797 100644
--- a/contrib/eigen/blas/testing/cblat1.f
+++ b/contrib/eigen/blas/testing/cblat1.f
@@ -619,7 +619,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/contrib/eigen/blas/testing/dblat1.f b/contrib/eigen/blas/testing/dblat1.f
index 30691f9bff9a66960a0bed397d87e3cf32ed3510..03d9f13456a6fa98163a612f23cf8ae97efde5ee 100644
--- a/contrib/eigen/blas/testing/dblat1.f
+++ b/contrib/eigen/blas/testing/dblat1.f
@@ -990,7 +990,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/contrib/eigen/blas/testing/sblat1.f b/contrib/eigen/blas/testing/sblat1.f
index 6657c269303786e1c2a594431380841e140e149a..4d43d9b4810f69f9daba956befe6d5d48aca2b69 100644
--- a/contrib/eigen/blas/testing/sblat1.f
+++ b/contrib/eigen/blas/testing/sblat1.f
@@ -946,7 +946,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/contrib/eigen/blas/testing/zblat1.f b/contrib/eigen/blas/testing/zblat1.f
index d30112c63775207dc38e7ba31b379817e43fa993..c00b67dc82b23e9273e59a9f30b2fd8cb43d2f13 100644
--- a/contrib/eigen/blas/testing/zblat1.f
+++ b/contrib/eigen/blas/testing/zblat1.f
@@ -619,7 +619,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/contrib/eigen/ci/CTest2JUnit.xsl b/contrib/eigen/ci/CTest2JUnit.xsl
new file mode 100644
index 0000000000000000000000000000000000000000..8ba21f4e60b5485ae0b81429bac76baaae2e9dba
--- /dev/null
+++ b/contrib/eigen/ci/CTest2JUnit.xsl
@@ -0,0 +1,120 @@
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<xsl:output method="xml" indent="yes"/>
+    <xsl:template match="/Site">
+	<xsl:variable name="Name"><xsl:value-of select="@Name"/></xsl:variable>
+	<xsl:variable name="Hostname"><xsl:value-of select="@Hostname"/></xsl:variable>
+	<xsl:variable name="TestCount"><xsl:value-of select="count(//TestList/Test)"/> </xsl:variable>
+	<xsl:variable name="ErrorCount"><xsl:value-of select="count(//TestList/Test[@Status='error'])"/> </xsl:variable>
+	<xsl:variable name="FailureCount"><xsl:value-of select="count(//Testing/Test[@Status='failed'])"/> </xsl:variable>
+	<testsuite name="{$Name}" hostname="{$Hostname}" errors="0" failures="{$FailureCount}" tests="{$TestCount}">
+	    <xsl:variable name="BuildName"><xsl:value-of select="@BuildName"/></xsl:variable>
+	    <xsl:variable name="BuildStamp"><xsl:value-of select="@BuildStamp"/></xsl:variable>
+	    <xsl:variable name="Generator"><xsl:value-of select="@Generator"/></xsl:variable>
+	    <xsl:variable name="CompilerName"><xsl:value-of select="@CompilerName"/></xsl:variable>
+	    <xsl:variable name="OSName"><xsl:value-of select="@OSName"/></xsl:variable>
+	    <xsl:variable name="OSRelease"><xsl:value-of select="@OSRelease"/></xsl:variable>
+	    <xsl:variable name="OSVersion"><xsl:value-of select="@OSVersion"/></xsl:variable>
+	    <xsl:variable name="OSPlatform"><xsl:value-of select="@OSPlatform"/></xsl:variable>
+	    <xsl:variable name="Is64Bits"><xsl:value-of select="@Is64Bits"/></xsl:variable>
+	    <xsl:variable name="VendorString"><xsl:value-of select="@VendorString"/></xsl:variable>
+	    <xsl:variable name="VendorID"><xsl:value-of select="@VendorID"/></xsl:variable>
+	    <xsl:variable name="FamilyID"><xsl:value-of select="@FamilyID"/></xsl:variable>
+	    <xsl:variable name="ModelID"><xsl:value-of select="@ModelID"/></xsl:variable>
+	    <xsl:variable name="ProcessorCacheSize"><xsl:value-of select="@ProcessorCacheSize"/></xsl:variable>
+	    <xsl:variable name="NumberOfLogicalCPU"><xsl:value-of select="@NumberOfLogicalCPU"/></xsl:variable>
+	    <xsl:variable name="NumberOfPhysicalCPU"><xsl:value-of select="@NumberOfPhysicalCPU"/></xsl:variable>
+	    <xsl:variable name="TotalVirtualMemory"><xsl:value-of select="@TotalVirtualMemory"/></xsl:variable>
+	    <xsl:variable name="TotalPhysicalMemory"><xsl:value-of select="@TotalPhysicalMemory"/></xsl:variable>
+	    <xsl:variable name="LogicalProcessorsPerPhysical"><xsl:value-of select="@LogicalProcessorsPerPhysical"/></xsl:variable>
+	    <xsl:variable name="ProcessorClockFrequency"><xsl:value-of select="@ProcessorClockFrequency"/></xsl:variable>
+	    <properties>
+		<property name="BuildName" value="{$BuildName}" />
+		<property name="BuildStamp" value="{$BuildStamp}" />
+		<property name="Name" value="{$Name}" />
+		<property name="Generator" value="{$Generator}" />
+		<property name="CompilerName" value="{$CompilerName}" />
+		<property name="OSName" value="{$OSName}" />
+		<property name="Hostname" value="{$Hostname}" />
+		<property name="OSRelease" value="{$OSRelease}" />
+		<property name="OSVersion" value="{$OSVersion}" />
+		<property name="OSPlatform" value="{$OSPlatform}" />
+		<property name="Is64Bits" value="{$Is64Bits}" />
+		<property name="VendorString" value="{$VendorString}" />
+		<property name="VendorID" value="{$VendorID}" />
+		<property name="FamilyID" value="{$FamilyID}" />
+		<property name="ModelID" value="{$ModelID}" />
+		<property name="ProcessorCacheSize" value="{$ProcessorCacheSize}" />
+		<property name="NumberOfLogicalCPU" value="{$NumberOfLogicalCPU}" />
+		<property name="NumberOfPhysicalCPU" value="{$NumberOfPhysicalCPU}" />
+		<property name="TotalVirtualMemory" value="{$TotalVirtualMemory}" />
+		<property name="TotalPhysicalMemory" value="{$TotalPhysicalMemory}" />
+		<property name="LogicalProcessorsPerPhysical" value="{$LogicalProcessorsPerPhysical}" />
+		<property name="ProcessorClockFrequency" value="{$ProcessorClockFrequency}" />
+	    </properties>
+	    <xsl:apply-templates select="Testing/Test"/>
+
+	    <system-out>
+		BuildName: <xsl:value-of select="$BuildName" />
+		BuildStamp: <xsl:value-of select="$BuildStamp" />
+		Name: <xsl:value-of select="$Name" />
+		Generator: <xsl:value-of select="$Generator" />
+		CompilerName: <xsl:value-of select="$CompilerName" />
+		OSName: <xsl:value-of select="$OSName" />
+		Hostname: <xsl:value-of select="$Hostname" />
+		OSRelease: <xsl:value-of select="$OSRelease" />
+		OSVersion: <xsl:value-of select="$OSVersion" />
+		OSPlatform: <xsl:value-of select="$OSPlatform" />
+		Is64Bits: <xsl:value-of select="$Is64Bits" />
+		VendorString: <xsl:value-of select="$VendorString" />
+		VendorID: <xsl:value-of select="$VendorID" />
+		FamilyID: <xsl:value-of select="$FamilyID" />
+		ModelID: <xsl:value-of select="$ModelID" />
+		ProcessorCacheSize: <xsl:value-of select="$ProcessorCacheSize" />
+		NumberOfLogicalCPU: <xsl:value-of select="$NumberOfLogicalCPU" />
+		NumberOfPhysicalCPU: <xsl:value-of select="$NumberOfPhysicalCPU" />
+		TotalVirtualMemory: <xsl:value-of select="$TotalVirtualMemory" />
+		TotalPhysicalMemory: <xsl:value-of select="$TotalPhysicalMemory" />
+		LogicalProcessorsPerPhysical: <xsl:value-of select="$LogicalProcessorsPerPhysical" />
+		ProcessorClockFrequency: <xsl:value-of select="$ProcessorClockFrequency" />
+	    </system-out>
+	</testsuite>
+    </xsl:template>
+
+    <xsl:template match="Testing/Test">
+	<xsl:variable name="testcasename"><xsl:value-of select= "Name"/></xsl:variable>
+	<xsl:variable name="testclassname"><xsl:value-of select= " concat('this', substring(Path,2))"/></xsl:variable>
+	<xsl:variable name="exectime">
+	    <xsl:for-each select="Results/NamedMeasurement">
+		<xsl:if test="@name = 'Execution Time'">
+		    <xsl:value-of select="."/>
+		</xsl:if>
+	    </xsl:for-each>
+	</xsl:variable>
+
+	<testcase name="{$testcasename}" classname="{$testclassname}" time="{$exectime}">
+	    <xsl:if test="@Status = 'passed'">
+	    </xsl:if>
+	    <xsl:if test="@Status = 'failed'">
+		<xsl:variable name="failtype">
+		    <xsl:for-each select="Results/NamedMeasurement">
+			<xsl:if test="@name = 'Exit Code'">
+			    <xsl:value-of select="."/>
+			</xsl:if>
+		    </xsl:for-each>
+		</xsl:variable>
+		<xsl:variable name="failcode">
+		    <xsl:for-each select="Results/NamedMeasurement">
+			<xsl:if test="@name = 'Exit Value'">
+			    <xsl:value-of select="."/>
+			</xsl:if>
+		    </xsl:for-each>
+		</xsl:variable>
+		<failure message="{$failtype} ({$failcode})"><xsl:value-of select="Results/Measurement/Value/text()" /></failure>
+	    </xsl:if>
+	    <xsl:if test="@Status = 'notrun'">
+		<skipped><xsl:value-of select="Results/Measurement/Value/text()" /></skipped>
+	    </xsl:if>
+	</testcase>
+    </xsl:template>
+
+</xsl:stylesheet>
diff --git a/contrib/eigen/ci/README.md b/contrib/eigen/ci/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8395b16015244296d8ef4f63dc4e50a361a410c2
--- /dev/null
+++ b/contrib/eigen/ci/README.md
@@ -0,0 +1,56 @@
+## Eigen CI infrastructure
+
+Eigen's CI infrastructure uses two stages: A `build` stage to build the unit-test
+suite and a `test` stage to run the unit-tests.
+
+### Build Stage
+
+The build stage consists of the following jobs:
+
+| Job Name                                 | Arch      | OS             | Compiler   | C++11   |
+|------------------------------------------|-----------|----------------|------------|---------|
+| `build:x86-64:linux:gcc-4.8:cxx11-off`   | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `Off`   |
+| `build:x86-64:linux:gcc-4.8:cxx11-on`    | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `On`    |
+| `build:x86-64:linux:gcc-9:cxx11-off`     | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `Off`   |
+| `build:x86-64:linux:gcc-9:cxx11-on`      | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `On`    |
+| `build:x86-64:linux:gcc-10:cxx11-off`    | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `Off`   |
+| `build:x86-64:linux:gcc-10:cxx11-on`     | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `On`    |
+| `build:x86-64:linux:clang-10:cxx11-off`  | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `Off`   |
+| `build:x86-64:linux:clang-10:cxx11-on`   | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `On`    |
+| `build:aarch64:linux:gcc-10:cxx11-off`   | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `Off`   |
+| `build:aarch64:linux:gcc-10:cxx11-on`    | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `On`    |
+| `build:aarch64:linux:clang-10:cxx11-off` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off`   |
+| `build:aarch64:linux:clang-10:cxx11-on`  | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On`    |
+
+### Test stage
+
+In principle every build-job has a corresponding test-job, however testing supported and unsupported modules is divided into separate jobs. The test jobs in detail:
+
+### Job dependecies
+
+| Job Name                                            | Arch      | OS             | Compiler   | C++11   | Module
+|-----------------------------------------------------|-----------|----------------|------------|---------|--------
+| `test:x86-64:linux:gcc-4.8:cxx11-off:official`      | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `Off`   | `Official`
+| `test:x86-64:linux:gcc-4.8:cxx11-off:unsupported`   | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `Off`   | `Unsupported`
+| `test:x86-64:linux:gcc-4.8:cxx11-on:official`       | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `On`    | `Official`
+| `test:x86-64:linux:gcc-4.8:cxx11-on:unsupported`    | `x86-64`  | `Ubuntu 18.04` | `GCC-4.8`  | `On`    | `Unsupported`
+| `test:x86-64:linux:gcc-9:cxx11-off:official`        | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `Off`   | `Official`
+| `test:x86-64:linux:gcc-9:cxx11-off:unsupported`     | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `Off`   | `Unsupported`
+| `test:x86-64:linux:gcc-9:cxx11-on:official`         | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `On`    | `Official`
+| `test:x86-64:linux:gcc-9:cxx11-on:unsupported`      | `x86-64`  | `Ubuntu 18.04` | `GCC-9`    | `On`    | `Unsupported`
+| `test:x86-64:linux:gcc-10:cxx11-off:official`       | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `Off`   | `Official`
+| `test:x86-64:linux:gcc-10:cxx11-off:unsupported`    | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `Off`   | `Unsupported`
+| `test:x86-64:linux:gcc-10:cxx11-on:official`        | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `On`    | `Official`
+| `test:x86-64:linux:gcc-10:cxx11-on:unsupported`     | `x86-64`  | `Ubuntu 18.04` | `GCC-10`   | `On`    | `Unsupported`
+| `test:x86-64:linux:clang-10:cxx11-off:official`     | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `Off`   | `Official`
+| `test:x86-64:linux:clang-10:cxx11-off:unsupported`  | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `Off`   | `Unsupported`
+| `test:x86-64:linux:clang-10:cxx11-on:official`      | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `On`    | `Official`
+| `test:x86-64:linux:clang-10:cxx11-on:unsupported`   | `x86-64`  | `Ubuntu 18.04` | `Clang-10` | `On`    | `Unsupported`
+| `test:aarch64:linux:gcc-10:cxx11-off:official`      | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `Off`   | `Official`
+| `test:aarch64:linux:gcc-10:cxx11-off:unsupported`   | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `Off`   | `Unsupported`
+| `test:aarch64:linux:gcc-10:cxx11-on:official`       | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `On`    | `Official`
+| `test:aarch64:linux:gcc-10:cxx11-on:unsupported`    | `AArch64` | `Ubuntu 18.04` | `GCC-10`   | `On`    | `Unsupported`
+| `test:aarch64:linux:clang-10:cxx11-off:official`    | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off`   | `Official`
+| `test:aarch64:linux:clang-10:cxx11-off:unsupported` | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `Off`   | `Unsupported`
+| `test:aarch64:linux:clang-10:cxx11-on:official`     | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On`    | `Official`
+| `test:aarch64:linux:clang-10:cxx11-on:unsupported`  | `AArch64` | `Ubuntu 18.04` | `Clang-10` | `On`    | `Unsupported`
diff --git a/contrib/eigen/ci/smoketests.gitlab-ci.yml b/contrib/eigen/ci/smoketests.gitlab-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6384f10769152b7d119da88e9d4538631da80e42
--- /dev/null
+++ b/contrib/eigen/ci/smoketests.gitlab-ci.yml
@@ -0,0 +1,107 @@
+.buildsmoketests:linux:base:
+  stage: buildsmoketests
+  image: ubuntu:18.04
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y  ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build
+  script:
+    - mkdir -p ${BUILDDIR} && cd ${BUILDDIR}
+    - CXX=${EIGEN_CI_CXX_COMPILER} CC=${EIGEN_CI_CC_COMPILER} cmake -G
+      ${EIGEN_CI_CMAKE_GENEATOR} -DEIGEN_TEST_CXX11=${EIGEN_TEST_CXX11}
+      ${EIGEN_CI_ADDITIONAL_ARGS} ..
+    - cmake --build . --target buildsmoketests
+  artifacts:
+    name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
+    paths:
+      - ${BUILDDIR}/
+    expire_in: 5 days
+  only:
+    - merge_requests
+
+buildsmoketests:x86-64:linux:gcc-10:cxx11-off:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "off"
+
+buildsmoketests:x86-64:linux:gcc-10:cxx11-on:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "g++-10"
+    EIGEN_CI_CC_COMPILER: "gcc-10"
+    EIGEN_TEST_CXX11: "on"
+
+buildsmoketests:x86-64:linux:clang-10:cxx11-off:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "off"
+
+buildsmoketests:x86-64:linux:clang-10:cxx11-on:
+  extends: .buildsmoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: "clang++-10"
+    EIGEN_CI_CC_COMPILER: "clang-10"
+    EIGEN_TEST_CXX11: "on"
+
+.smoketests:linux:base:
+  stage: smoketests
+  image: ubuntu:18.04
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build xsltproc
+  script:
+    - export CXX=${EIGEN_CI_CXX_COMPILER}
+    - export CC=${EIGEN_CI_CC_COMPILER}
+    - cd ${BUILDDIR} && ctest --output-on-failure --no-compress-output
+      --build-no-clean -T test -L smoketest
+  after_script:
+    - apt-get update -y
+    - apt-get install --no-install-recommends -y xsltproc
+    - cd ${BUILDDIR}
+    - xsltproc ../ci/CTest2JUnit.xsl Testing/`head -n 1 < Testing/TAG`/Test.xml > "JUnitTestResults_$CI_JOB_ID.xml"
+  artifacts:
+    reports:
+      junit:
+        - ${BUILDDIR}/JUnitTestResults_$CI_JOB_ID.xml
+    expire_in: 5 days
+  only:
+    - merge_requests
+
+smoketests:x86-64:linux:gcc-10:cxx11-off:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-off" ]
+
+smoketests:x86-64:linux:gcc-10:cxx11-on:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "buildsmoketests:x86-64:linux:gcc-10:cxx11-on" ]
+
+smoketests:x86-64:linux:clang-10:cxx11-off:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-off" ]
+
+smoketests:x86-64:linux:clang-10:cxx11-on:
+  extends: .smoketests:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "buildsmoketests:x86-64:linux:clang-10:cxx11-on" ]
diff --git a/contrib/eigen/ci/test.gitlab-ci.yml b/contrib/eigen/ci/test.gitlab-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2a0f5dd17741c8e4b38642e221aadef6361d5e0d
--- /dev/null
+++ b/contrib/eigen/ci/test.gitlab-ci.yml
@@ -0,0 +1,388 @@
+.test:linux:base:
+  stage: test
+  image: ubuntu:18.04
+  retry: 2
+  before_script:
+    - apt-get update -y
+    - apt-get install -y --no-install-recommends software-properties-common
+    - add-apt-repository -y ppa:ubuntu-toolchain-r/test
+    - apt-get update
+    - apt-get install --no-install-recommends -y ${EIGEN_CI_CXX_COMPILER}
+      ${EIGEN_CI_CC_COMPILER} cmake ninja-build xsltproc
+  script:
+    - export CXX=${EIGEN_CI_CXX_COMPILER}
+    - export CC=${EIGEN_CI_CC_COMPILER}
+    - cd ${BUILDDIR} && ctest --output-on-failure --no-compress-output
+      --build-no-clean -T test -L ${EIGEN_CI_TEST_LABEL}
+  after_script:
+    - apt-get update -y
+    - apt-get install --no-install-recommends -y xsltproc
+    - cd ${BUILDDIR}
+    - xsltproc ../ci/CTest2JUnit.xsl Testing/`head -n 1 < Testing/TAG`/Test.xml > "JUnitTestResults_$CI_JOB_ID.xml"
+  artifacts:
+    reports:
+      junit:
+        - ${BUILDDIR}/JUnitTestResults_$CI_JOB_ID.xml
+    expire_in: 5 days
+  only:
+    - schedules
+
+##### x86-64 ###################################################################
+# GCC-4.8
+.test:x86-64:linux:gcc-4.8:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-4.8
+    EIGEN_CI_CC_COMPILER: gcc-4.8
+  needs: [ "build:x86-64:linux:gcc-4.8:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-4.8:cxx11-off:official:
+  extends: .test:x86-64:linux:gcc-4.8:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-4.8:cxx11-off:unsupported:
+  extends: .test:x86-64:linux:gcc-4.8:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:x86-64:linux:gcc-4.8:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-4.8
+    EIGEN_CI_CC_COMPILER: gcc-4.8
+  needs: [ "build:x86-64:linux:gcc-4.8:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-4.8:cxx11-on:official:
+  extends: .test:x86-64:linux:gcc-4.8:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-4.8:cxx11-on:unsupported:
+  extends: .test:x86-64:linux:gcc-4.8:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# GCC-9
+.test:x86-64:linux:gcc-9:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-9
+    EIGEN_CI_CC_COMPILER: gcc-9
+  needs: [ "build:x86-64:linux:gcc-9:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-9:cxx11-off:official:
+  extends: .test:x86-64:linux:gcc-9:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-9:cxx11-off:unsupported:
+  extends: .test:x86-64:linux:gcc-9:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:x86-64:linux:gcc-9:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-9
+    EIGEN_CI_CC_COMPILER: gcc-9
+  needs: [ "build:x86-64:linux:gcc-9:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-9:cxx11-on:official:
+  extends: .test:x86-64:linux:gcc-9:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-9:cxx11-on:unsupported:
+  extends: .test:x86-64:linux:gcc-9:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# GCC-10
+.test:x86-64:linux:gcc-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:x86-64:linux:gcc-10:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-10:cxx11-off:official:
+  extends: .test:x86-64:linux:gcc-10:cxx11-off
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-10:cxx11-off:unsupported:
+  extends: .test:x86-64:linux:gcc-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:x86-64:linux:gcc-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:x86-64:linux:gcc-10:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:gcc-10:cxx11-on:official:
+  extends: .test:x86-64:linux:gcc-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:gcc-10:cxx11-on:unsupported:
+  extends: .test:x86-64:linux:gcc-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# Clang 10
+.test:x86-64:linux:clang-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:x86-64:linux:clang-10:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:clang-10:cxx11-off:official:
+  extends: .test:x86-64:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:clang-10:cxx11-off:unsupported:
+  extends: .test:x86-64:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:x86-64:linux:clang-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:x86-64:linux:clang-10:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - x86-64
+
+test:x86-64:linux:clang-10:cxx11-on:official:
+  extends: .test:x86-64:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:x86-64:linux:clang-10:cxx11-on:unsupported:
+  extends: .test:x86-64:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+##### AArch64 ##################################################################
+# GCC-10
+.test:aarch64:linux:gcc-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:aarch64:linux:gcc-10:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - aarch64
+
+test:aarch64:linux:gcc-10:cxx11-off:official:
+  extends: .test:aarch64:linux:gcc-10:cxx11-off
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:aarch64:linux:gcc-10:cxx11-off:unsupported:
+  extends: .test:aarch64:linux:gcc-10:cxx11-off
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:aarch64:linux:gcc-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:aarch64:linux:gcc-10:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - aarch64
+
+test:aarch64:linux:gcc-10:cxx11-on:official:
+  extends: .test:aarch64:linux:gcc-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:aarch64:linux:gcc-10:cxx11-on:unsupported:
+  extends: .test:aarch64:linux:gcc-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# Clang 10
+.test:aarch64:linux:clang-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:aarch64:linux:clang-10:cxx11-off" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - aarch64
+
+test:aarch64:linux:clang-10:cxx11-off:official:
+  extends: .test:aarch64:linux:clang-10:cxx11-off
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:aarch64:linux:clang-10:cxx11-off:unsupported:
+  extends: .test:aarch64:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:aarch64:linux:clang-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:aarch64:linux:clang-10:cxx11-on" ]
+  tags: 
+    - eigen-runner
+    - linux
+    - aarch64
+
+test:aarch64:linux:clang-10:cxx11-on:official:
+  extends: .test:aarch64:linux:clang-10:cxx11-on
+  allow_failure: true
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:aarch64:linux:clang-10:cxx11-on:unsupported:
+  extends: .test:aarch64:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+##### ppc64le ##################################################################
+# GCC-10
+.test:ppc64le:linux:gcc-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:ppc64le:linux:gcc-10:cxx11-off" ]
+  allow_failure: true
+  tags: 
+    - eigen-runner
+    - linux
+    - ppc64le
+
+test:ppc64le:linux:gcc-10:cxx11-off:official:
+  extends: .test:ppc64le:linux:gcc-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:ppc64le:linux:gcc-10:cxx11-off:unsupported:
+  extends: .test:ppc64le:linux:gcc-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:ppc64le:linux:gcc-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_CC_COMPILER: gcc-10
+  needs: [ "build:ppc64le:linux:gcc-10:cxx11-on" ]
+  allow_failure: true
+  tags: 
+    - eigen-runner
+    - linux
+    - ppc64le
+
+test:ppc64le:linux:gcc-10:cxx11-on:official:
+  extends: .test:ppc64le:linux:gcc-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:ppc64le:linux:gcc-10:cxx11-on:unsupported:
+  extends: .test:ppc64le:linux:gcc-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+# # Clang 10
+.test:ppc64le:linux:clang-10:cxx11-off:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:ppc64le:linux:clang-10:cxx11-off" ]
+  allow_failure: true
+  tags: 
+    - eigen-runner
+    - linux
+    - ppc64le
+
+test:ppc64le:linux:clang-10:cxx11-off:official:
+  extends: .test:ppc64le:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:ppc64le:linux:clang-10:cxx11-off:unsupported:
+  extends: .test:ppc64le:linux:clang-10:cxx11-off
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
+
+.test:ppc64le:linux:clang-10:cxx11-on:
+  extends: .test:linux:base
+  variables:
+    EIGEN_CI_CXX_COMPILER: clang++-10
+    EIGEN_CI_CC_COMPILER: clang-10
+  needs: [ "build:ppc64le:linux:clang-10:cxx11-on" ]
+  allow_failure: true
+  tags: 
+    - eigen-runner
+    - linux
+    - ppc64le
+
+test:ppc64le:linux:clang-10:cxx11-on:official:
+  extends: .test:ppc64le:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Official"
+
+test:ppc64le:linux:clang-10:cxx11-on:unsupported:
+  extends: .test:ppc64le:linux:clang-10:cxx11-on
+  variables:
+    EIGEN_CI_TEST_LABEL: "Unsupported"
diff --git a/contrib/eigen/cmake/ComputeCppCompilerChecks.cmake b/contrib/eigen/cmake/ComputeCppCompilerChecks.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1807485e47d535af2e9bf0e2590554d9662b3707
--- /dev/null
+++ b/contrib/eigen/cmake/ComputeCppCompilerChecks.cmake
@@ -0,0 +1,50 @@
+cmake_minimum_required(VERSION 3.4.3)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
+    message(FATAL_ERROR "host compiler - gcc version must be > 4.8")
+  endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
+    message(FATAL_ERROR "host compiler - clang version must be > 3.6")
+  endif()
+endif()
+
+if(MSVC)
+  set(ComputeCpp_STL_CHECK_SRC __STL_check)
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp
+    "#include <ios>\n"
+    "int main() { return 0; }\n")
+  execute_process(
+    COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
+            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
+            -isystem ${ComputeCpp_INCLUDE_DIRS}
+            -o ${ComputeCpp_STL_CHECK_SRC}.sycl
+            -c ${ComputeCpp_STL_CHECK_SRC}.cpp
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
+    ERROR_QUIET
+    OUTPUT_QUIET)
+  if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
+    # Try disabling compiler version checks
+    execute_process(
+      COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
+              ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
+              -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH
+              -isystem ${ComputeCpp_INCLUDE_DIRS}
+              -o ${ComputeCpp_STL_CHECK_SRC}.cpp.sycl
+              -c ${ComputeCpp_STL_CHECK_SRC}.cpp
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
+      ERROR_QUIET
+      OUTPUT_QUIET)
+    if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
+      message(STATUS "Device compiler cannot consume hosted STL headers. Using any parts of the STL will likely result in device compiler errors.")
+    else()
+    message(STATUS "Device compiler does not meet certain STL version requirements. Disabling version checks and hoping for the best.")
+      list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH)
+    endif()
+  endif()
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp
+              ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp.sycl)
+endif(MSVC)
diff --git a/contrib/eigen/cmake/ComputeCppIRMap.cmake b/contrib/eigen/cmake/ComputeCppIRMap.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..942d91d6401975cf80bf2fc87b9799a55d9e77d2
--- /dev/null
+++ b/contrib/eigen/cmake/ComputeCppIRMap.cmake
@@ -0,0 +1,18 @@
+cmake_minimum_required(VERSION 3.4.3)
+
+# These should match the types of IR output by compute++
+set(IR_MAP_spir bc)
+set(IR_MAP_spir64 bc)
+set(IR_MAP_spir32 bc)
+set(IR_MAP_spirv spv)
+set(IR_MAP_spirv64 spv)
+set(IR_MAP_spirv32 spv)
+set(IR_MAP_aorta-x86_64 o)
+set(IR_MAP_aorta-aarch64 o)
+set(IR_MAP_aorta-rcar-cve o)
+set(IR_MAP_custom-spir64 bc)
+set(IR_MAP_custom-spir32 bc)
+set(IR_MAP_custom-spirv64 spv)
+set(IR_MAP_custom-spirv32 spv)
+set(IR_MAP_ptx64 s)
+set(IR_MAP_amdgcn s)
diff --git a/contrib/eigen/cmake/Eigen3Config.cmake.in b/contrib/eigen/cmake/Eigen3Config.cmake.in
index c5c54688771aae50c8c8bbb79b90f07b718a2757..0a1ac61c94d79fa917d7483d14b511085db7319a 100644
--- a/contrib/eigen/cmake/Eigen3Config.cmake.in
+++ b/contrib/eigen/cmake/Eigen3Config.cmake.in
@@ -3,7 +3,9 @@
 
 @PACKAGE_INIT@
 
-include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")
+if (NOT TARGET eigen)
+  include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake")
+endif ()
 
 # Legacy variables, do *not* use. May be removed in the future.
 
diff --git a/contrib/eigen/cmake/EigenConfigureTesting.cmake b/contrib/eigen/cmake/EigenConfigureTesting.cmake
index 3a824397f398bca7927acda55bb3bc86385fe71e..9cb3bb20bbcc56fa206e0714fd832d66227d7a18 100644
--- a/contrib/eigen/cmake/EigenConfigureTesting.cmake
+++ b/contrib/eigen/cmake/EigenConfigureTesting.cmake
@@ -22,7 +22,7 @@ set(EIGEN_DASHBOARD_BUILD_TARGET "buildtests" CACHE STRING "Target to be built i
 set(EIGEN_CTEST_ERROR_EXCEPTION "" CACHE STRING "Regular expression for build error messages to be filtered out")
 
 # Overwrite default DartConfiguration.tcl such that ctest can build our unit tests.
-# Recall that our unit tests are not in the "all" target, so we have to explicitely ask ctest to build our custom 'buildtests' target.
+# Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target.
 # At this stage, we can also add custom flags to the build tool through the user defined EIGEN_TEST_BUILD_FLAGS variable.
 file(READ  "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" EIGEN_DART_CONFIG_FILE)
 # try to grab the default flags
@@ -41,7 +41,7 @@ ei_init_testing()
 
 # configure Eigen related testing options
 option(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions using exceptions" OFF)
-option(EIGEN_DEBUG_ASSERTS "Enable advanced debuging of assertions" OFF)
+option(EIGEN_DEBUG_ASSERTS "Enable advanced debugging of assertions" OFF)
 
 if(CMAKE_COMPILER_IS_GNUCXX)
   option(EIGEN_COVERAGE_TESTING "Enable/disable gcov" OFF)
@@ -49,10 +49,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     set(COVERAGE_FLAGS "-fprofile-arcs -ftest-coverage")
     set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}")
-  endif(EIGEN_COVERAGE_TESTING)
+  endif()
   
 elseif(MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
-endif(CMAKE_COMPILER_IS_GNUCXX)
+endif()
 
 
diff --git a/contrib/eigen/cmake/EigenSmokeTestList.cmake b/contrib/eigen/cmake/EigenSmokeTestList.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..6f0f724186915916174ad8244bd11651785d9e3d
--- /dev/null
+++ b/contrib/eigen/cmake/EigenSmokeTestList.cmake
@@ -0,0 +1,131 @@
+# List of tests that will be build and run during Eigen's smoke testing. If one
+# of these tests doesn't exists or cannot be build with the current configuration
+# it will just be skipped.
+set(ei_smoke_test_list
+  adjoint_1
+  alignedvector3
+  array_cwise_7
+  array_cwise_8
+  array_for_matrix_1
+  array_of_string
+  array_replicate_1
+  array_reverse_1
+  autodiff_1
+  autodiff_scalar_1
+  bandmatrix
+  bdcsvd_9
+  bessel_functions_1
+  bfloat16_float
+  blasutil_1
+  block_5
+  BVH
+  cholesky_1
+  cholmod_support_23
+  cholmod_support_24
+  conservative_resize_1
+  constructor_1
+  corners_1
+  ctorleakmiscmatrices_4
+  dense_storage
+  determinant_1
+  diagonal_1
+  diagonal_2
+  diagonalmatrices_1
+  dynalloc
+  eigensolver_complex_1
+  eigensolver_selfadjoint_8
+  EulerAngles_1
+  exceptions
+  fastmath
+  first_aligned
+  geo_alignedbox_2
+  geo_eulerangles_1
+  geo_homogeneous_1
+  geo_hyperplane_1
+  geo_orthomethods_1
+  geo_parametrizedline_1
+  geo_transformations_7
+  half_float
+  hessenberg_1
+  hessenberg_6qr_10
+  householder_8
+  indexed_view_1
+  inplace_decomposition_1
+  integer_types_1
+  inverse_1
+  is_same_dense
+  jacobi_1
+  jacobisvd_1
+  kronecker_product
+  linearstructure_1
+  mapped_matrix_1
+  mapstaticmethods_1
+  mapstride_1
+  matrix_square_root_1
+  meta
+  minres_2
+  miscmatrices_1
+  mixingtypes_7
+  nestbyvalue
+  nesting_ops_1
+  nomalloc_1
+  nullary_1
+  num_dimensions
+  NumericalDiff
+  numext
+  packetmath
+  permutationmatrices_1
+  polynomialsolver_1
+  prec_inverse_4x4_1
+  product_extra_5
+  product_selfadjoint_1
+  product_small_7
+  product_symm_1
+  product_syrk_1
+  product_trmm_1
+  product_trmv_1
+  product_trsolve_5
+  qr_1
+  qr_colpivoting_7
+  qr_fullpivoting_4
+  rand
+  real_qz_1
+  redux_1
+  ref_1
+  resize
+  rvalue_types_1
+  schur_complex_1
+  schur_real_1
+  selfadjoint_1
+  sizeof
+  sizeoverflow
+  smallvectors
+  sparse_basic_3
+  sparse_block_1
+  sparse_extra_4
+  sparse_permutations_2
+  sparse_product_4
+  sparse_ref_1
+  sparse_solvers_1
+  sparse_vector_1
+  special_functions_1
+  special_numbers_1
+  special_packetmath_1
+  spqr_support_2
+  stable_norm_1
+  stddeque_1
+  stddeque_overload_1
+  stdlist_1
+  stdlist_overload_1
+  stdvector_1
+  stdvector_overload_1
+  stl_iterators_1
+  swap_1
+  symbolic_index_1
+  triangular_1
+  type_aliaslu_9
+  umeyama_3
+  unalignedassert
+  unalignedcount
+  vectorwiseop_1
+  visitor_1)
\ No newline at end of file
diff --git a/contrib/eigen/cmake/EigenTesting.cmake b/contrib/eigen/cmake/EigenTesting.cmake
index 3d0074c71ad1c14418daf805953e2c557dacff25..eb8457db6f90e205a6571e6b18676299f533e510 100644
--- a/contrib/eigen/cmake/EigenTesting.cmake
+++ b/contrib/eigen/cmake/EigenTesting.cmake
@@ -6,7 +6,7 @@ macro(ei_add_property prop value)
   else()
     set_property(GLOBAL PROPERTY ${prop} "${previous} ${value}")
   endif()
-endmacro(ei_add_property)
+endmacro()
 
 #internal. See documentation of ei_add_test for details.
 macro(ei_add_test_internal testname testname_with_suffix)
@@ -18,20 +18,34 @@ macro(ei_add_test_internal testname testname_with_suffix)
     set(filename ${testname}.cpp)
   endif()
 
+  # Add the current target to the list of subtest targets
+  get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST)
+  set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n")
+  set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+
   if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
-    if(EIGEN_TEST_CUDA_CLANG)
+    if(EIGEN_TEST_HIP)
+      hip_reset_flags()
+      hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}")
+    elseif(EIGEN_TEST_CUDA_CLANG)
       set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
-      if(CUDA_64_BIT_DEVICE_CODE)
+      
+      if(CUDA_64_BIT_DEVICE_CODE AND (EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64"))
         link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
       else()
         link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib")
       endif()
+
       if (${ARGC} GREATER 2)
         add_executable(${targetname} ${filename})
       else()
         add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
       endif()
-      target_link_libraries(${targetname} "cudart_static" "cuda" "dl" "rt" "pthread")
+      set(CUDA_CLANG_LINK_LIBRARIES "cudart_static" "cuda" "dl" "pthread")
+      if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+      set(CUDA_CLANG_LINK_LIBRARIES ${CUDA_CLANG_LINK_LIBRARIES} "rt")
+      endif()
+      target_link_libraries(${targetname} ${CUDA_CLANG_LINK_LIBRARIES})
     else()
       if (${ARGC} GREATER 2)
         cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
@@ -51,119 +65,22 @@ macro(ei_add_test_internal testname testname_with_suffix)
 
   if(EIGEN_NO_ASSERTION_CHECKING)
     ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1")
-  else(EIGEN_NO_ASSERTION_CHECKING)
+  else()
     if(EIGEN_DEBUG_ASSERTS)
       ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1")
-    endif(EIGEN_DEBUG_ASSERTS)
-  endif(EIGEN_NO_ASSERTION_CHECKING)
-
-  ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")
-
-  ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}")
-
-  if(MSVC)
-    ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
-  endif()
-
-  # let the user pass flags.
-  if(${ARGC} GREATER 2)
-    ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}")
-  endif(${ARGC} GREATER 2)
-
-  if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
-    ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
-  endif()
-
-  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
-    target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
-  endif()
-  if(EXTERNAL_LIBS)
-    target_link_libraries(${targetname} ${EXTERNAL_LIBS})
-  endif()
-  if(EIGEN_TEST_CUSTOM_LINKER_FLAGS)
-    target_link_libraries(${targetname} ${EIGEN_TEST_CUSTOM_LINKER_FLAGS})
-  endif()
-
-  if(${ARGC} GREATER 3)
-    set(libs_to_link ${ARGV3})
-    # it could be that some cmake module provides a bad library string " "  (just spaces),
-    # and that severely breaks target_link_libraries ("can't link to -l-lstdc++" errors).
-    # so we check for strings containing only spaces.
-    string(STRIP "${libs_to_link}" libs_to_link_stripped)
-    string(LENGTH "${libs_to_link_stripped}" libs_to_link_stripped_length)
-    if(${libs_to_link_stripped_length} GREATER 0)
-      # notice: no double quotes around ${libs_to_link} here. It may be a list.
-      target_link_libraries(${targetname} ${libs_to_link})
     endif()
   endif()
 
-  add_test(${testname_with_suffix} "${targetname}")
-
-  # Specify target and test labels accoirding to EIGEN_CURRENT_SUBPROJECT
-  get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
-  if ((current_subproject) AND (NOT (current_subproject STREQUAL "")))
-    set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}")
-    add_dependencies("Build${current_subproject}" ${targetname})
-    set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}")
-  endif()
-
-endmacro(ei_add_test_internal)
-
-# SYCL
-macro(ei_add_test_internal_sycl testname testname_with_suffix)
-  include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include)
-  set(targetname ${testname_with_suffix})
-
-  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
-    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
-  else()
-    set(filename ${testname}.cpp)
-  endif()
-
-  set( include_file ${CMAKE_CURRENT_BINARY_DIR}/inc_${filename})
-  set( bc_file ${CMAKE_CURRENT_BINARY_DIR}/${filename})
-  set( host_file ${CMAKE_CURRENT_SOURCE_DIR}/${filename})
-
-  ADD_CUSTOM_COMMAND(
-    OUTPUT ${include_file}
-    COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file}
-    COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}.sycl\\\"" >> ${include_file}
-    DEPENDS ${filename} ${bc_file}.sycl
-    COMMENT "Building ComputeCpp integration header file ${include_file}"
-  )
-  # Add a custom target for the generated integration header
-  add_custom_target(${testname}_integration_header_sycl DEPENDS ${include_file})
-
-  add_executable(${targetname} ${include_file})
-  add_dependencies(${targetname} ${testname}_integration_header_sycl)
-  add_sycl_to_target(${targetname} ${filename} ${CMAKE_CURRENT_BINARY_DIR})
-
-  if (targetname MATCHES "^eigen2_")
-    add_dependencies(eigen2_buildtests ${targetname})
-  else()
-    add_dependencies(buildtests ${targetname})
-  endif()
-
-  if(EIGEN_NO_ASSERTION_CHECKING)
-    ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1")
-  else(EIGEN_NO_ASSERTION_CHECKING)
-    if(EIGEN_DEBUG_ASSERTS)
-      ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1")
-    endif(EIGEN_DEBUG_ASSERTS)
-  endif(EIGEN_NO_ASSERTION_CHECKING)
-
   ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")
 
-  ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}")
-
-  if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS)
+  if(MSVC)
     ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
   endif()
 
   # let the user pass flags.
   if(${ARGC} GREATER 2)
     ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}")
-  endif(${ARGC} GREATER 2)
+  endif()
 
   if(EIGEN_TEST_CUSTOM_CXX_FLAGS)
     ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}")
@@ -201,11 +118,28 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix)
     add_dependencies("Build${current_subproject}" ${targetname})
     set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}")
   endif()
-
-
-endmacro(ei_add_test_internal_sycl)
-
-
+  if(EIGEN_SYCL)
+    # Force include of the SYCL file at the end to avoid errors.
+    set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1)
+    # Set COMPILE_FLAGS to COMPILE_DEFINITIONS instead to avoid having to duplicate the flags
+    # to the device compiler.
+    get_target_property(target_compile_flags ${targetname} COMPILE_FLAGS)
+    separate_arguments(target_compile_flags)
+    foreach(flag ${target_compile_flags})
+      if(${flag} MATCHES "^-D.*")
+        string(REPLACE "-D" "" definition_flag ${flag})
+        set_property(TARGET ${targetname} APPEND PROPERTY COMPILE_DEFINITIONS ${definition_flag})
+        list(REMOVE_ITEM target_compile_flags ${flag})
+      endif()
+    endforeach()
+    set_property(TARGET ${targetname} PROPERTY COMPILE_FLAGS ${target_compile_flags})
+    # Link against pthread and add sycl to target
+    set(THREADS_PREFER_PTHREAD_FLAG ON)
+    find_package(Threads REQUIRED)
+    target_link_libraries(${targetname} Threads::Threads)
+    add_sycl_to_target(TARGET ${targetname} SOURCES ${filename})
+  endif(EIGEN_SYCL)
+endmacro(ei_add_test_internal)
 # Macro to add a test
 #
 # the unique mandatory parameter testname must correspond to a file
@@ -240,7 +174,7 @@ endmacro(ei_add_test_internal_sycl)
 #
 # If EIGEN_SPLIT_LARGE_TESTS is ON, the test is split into multiple executables
 #   test_<testname>_<N>
-# where N runs from 1 to the greatest occurence found in the source file. Each of these
+# where N runs from 1 to the greatest occurrence found in the source file. Each of these
 # executables is built passing -DEIGEN_TEST_PART_N. This allows to split large tests
 # into smaller executables.
 #
@@ -260,98 +194,61 @@ macro(ei_add_test testname)
   endif()
 
   file(READ "${filename}" test_source)
-  set(parts 0)
   string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
-         occurences "${test_source}")
-  string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}")
+         occurrences "${test_source}")
+  string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}")
   list(REMOVE_DUPLICATES suffixes)
-  if(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
+  set(explicit_suffixes "")
+  if( (NOT EIGEN_SPLIT_LARGE_TESTS) AND suffixes)
+    # Check whether we have EIGEN_TEST_PART_* statements, in which case we likely must enforce splitting.
+    # For instance, indexed_view activate a different c++ version for each part.
+    string(REGEX MATCHALL "EIGEN_TEST_PART_[0-9]+" occurrences "${test_source}")
+    string(REGEX REPLACE "EIGEN_TEST_PART_" "" explicit_suffixes "${occurrences}")
+    list(REMOVE_DUPLICATES explicit_suffixes)
+  endif()
+  if( (EIGEN_SPLIT_LARGE_TESTS AND suffixes) OR explicit_suffixes)
     add_custom_target(${testname})
     foreach(suffix ${suffixes})
       ei_add_test_internal(${testname} ${testname}_${suffix}
         "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}")
       add_dependencies(${testname} ${testname}_${suffix})
-    endforeach(suffix)
-  else(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
-    set(symbols_to_enable_all_parts "")
-    foreach(suffix ${suffixes})
-      set(symbols_to_enable_all_parts
-        "${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1")
-    endforeach(suffix)
-    ei_add_test_internal(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}")
-  endif(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
-endmacro(ei_add_test)
-
-macro(ei_add_test_sycl testname)
-  get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
-  set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n")
-  set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
-
-  if(EIGEN_ADD_TEST_FILENAME_EXTENSION)
-    set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION})
+    endforeach()
   else()
-    set(filename ${testname}.cpp)
+    ei_add_test_internal(${testname} ${testname} "${ARGV1} -DEIGEN_TEST_PART_ALL=1" "${ARGV2}")
   endif()
-
-  file(READ "${filename}" test_source)
-  set(parts 0)
-  string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
-         occurences "${test_source}")
-  string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}")
-  list(REMOVE_DUPLICATES suffixes)
-  if(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
-    add_custom_target(${testname})
-    foreach(suffix ${suffixes})
-      ei_add_test_internal_sycl(${testname} ${testname}_${suffix}
-        "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}")
-      add_dependencies(${testname} ${testname}_${suffix})
-    endforeach(suffix)
-  else(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
-    set(symbols_to_enable_all_parts "")
-    foreach(suffix ${suffixes})
-      set(symbols_to_enable_all_parts
-        "${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1")
-    endforeach(suffix)
-    ei_add_test_internal_sycl(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}")
-  endif(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
-endmacro(ei_add_test_sycl)
+endmacro()
 
 # adds a failtest, i.e. a test that succeed if the program fails to compile
 # note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON
 # so here we're just running CMake commands immediately, we're not adding any targets.
 macro(ei_add_failtest testname)
-  get_property(EIGEN_FAILTEST_FAILURE_COUNT GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT)
-  get_property(EIGEN_FAILTEST_COUNT GLOBAL PROPERTY EIGEN_FAILTEST_COUNT)
 
-  message(STATUS "Checking failtest: ${testname}")
-  set(filename "${testname}.cpp")
-  file(READ "${filename}" test_source)
+  set(test_target_ok ${testname}_ok)
+  set(test_target_ko ${testname}_ko)
 
-  try_compile(succeeds_when_it_should_fail
-              "${CMAKE_CURRENT_BINARY_DIR}"
-              "${CMAKE_CURRENT_SOURCE_DIR}/${filename}"
-              COMPILE_DEFINITIONS "-DEIGEN_SHOULD_FAIL_TO_BUILD")
-  if (succeeds_when_it_should_fail)
-    message(STATUS "FAILED: ${testname} build succeeded when it should have failed")
-  endif()
+  # Add executables
+  add_executable(${test_target_ok} ${testname}.cpp)
+  add_executable(${test_target_ko} ${testname}.cpp)
 
-  try_compile(succeeds_when_it_should_succeed
-              "${CMAKE_CURRENT_BINARY_DIR}"
-              "${CMAKE_CURRENT_SOURCE_DIR}/${filename}"
-              COMPILE_DEFINITIONS)
-  if (NOT succeeds_when_it_should_succeed)
-    message(STATUS "FAILED: ${testname} build failed when it should have succeeded")
-  endif()
+  # Remove them from the normal build process
+  set_target_properties(${test_target_ok} ${test_target_ko} PROPERTIES
+                        EXCLUDE_FROM_ALL TRUE
+                        EXCLUDE_FROM_DEFAULT_BUILD TRUE)
 
-  if (succeeds_when_it_should_fail OR NOT succeeds_when_it_should_succeed)
-    math(EXPR EIGEN_FAILTEST_FAILURE_COUNT ${EIGEN_FAILTEST_FAILURE_COUNT}+1)
-  endif()
+  # Configure the failing test
+  target_compile_definitions(${test_target_ko} PRIVATE EIGEN_SHOULD_FAIL_TO_BUILD)
 
-  math(EXPR EIGEN_FAILTEST_COUNT ${EIGEN_FAILTEST_COUNT}+1)
+  # Add the tests to ctest.
+  add_test(NAME ${test_target_ok}
+          COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ok} --config $<CONFIGURATION>
+          WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+  add_test(NAME ${test_target_ko}
+          COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ko} --config $<CONFIGURATION>
+          WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 
-  set_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT ${EIGEN_FAILTEST_FAILURE_COUNT})
-  set_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT ${EIGEN_FAILTEST_COUNT})
-endmacro(ei_add_failtest)
+  # Expect the second test to fail
+  set_tests_properties(${test_target_ko} PROPERTIES WILL_FAIL TRUE)
+endmacro()
 
 # print a summary of the different options
 macro(ei_testing_print_summary)
@@ -418,6 +315,12 @@ macro(ei_testing_print_summary)
       message(STATUS "AVX:               Using architecture defaults")
     endif()
 
+    if(EIGEN_TEST_AVX2)
+      message(STATUS "AVX2:              ON")
+    else()
+      message(STATUS "AVX2:              Using architecture defaults")
+    endif()
+
     if(EIGEN_TEST_FMA)
       message(STATUS "FMA:               ON")
     else()
@@ -430,6 +333,12 @@ macro(ei_testing_print_summary)
       message(STATUS "AVX512:            Using architecture defaults")
     endif()
 
+    if(EIGEN_TEST_AVX512DQ)
+      message(STATUS "AVX512DQ:          ON")
+    else()
+      message(STATUS "AVX512DQ:          Using architecture defaults")
+    endif()
+
     if(EIGEN_TEST_ALTIVEC)
       message(STATUS "Altivec:           ON")
     else()
@@ -442,6 +351,12 @@ macro(ei_testing_print_summary)
       message(STATUS "VSX:               Using architecture defaults")
     endif()
 
+    if(EIGEN_TEST_MSA)
+      message(STATUS "MIPS MSA:          ON")
+    else()
+      message(STATUS "MIPS MSA:          Using architecture defaults")
+    endif()
+
     if(EIGEN_TEST_NEON)
       message(STATUS "ARM NEON:          ON")
     else()
@@ -467,7 +382,11 @@ macro(ei_testing_print_summary)
     endif()
 
     if(EIGEN_TEST_SYCL)
-      message(STATUS "SYCL:              ON")
+      if(EIGEN_SYCL_TRISYCL)
+        message(STATUS "SYCL:              ON (using triSYCL)")
+      else()
+        message(STATUS "SYCL:              ON (using computeCPP)")
+      endif()
     else()
       message(STATUS "SYCL:              OFF")
     endif()
@@ -480,13 +399,18 @@ macro(ei_testing_print_summary)
     else()
       message(STATUS "CUDA:              OFF")
     endif()
+    if(EIGEN_TEST_HIP)
+      message(STATUS "HIP:               ON (using hipcc)")
+    else()
+      message(STATUS "HIP:               OFF")
+    endif()
 
   endif() # vectorization / alignment options
 
   message(STATUS "\n${EIGEN_TESTING_SUMMARY}")
 
   message(STATUS "************************************************************")
-endmacro(ei_testing_print_summary)
+endmacro()
 
 macro(ei_init_testing)
   define_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT BRIEF_DOCS " " FULL_DOCS " ")
@@ -494,11 +418,13 @@ macro(ei_init_testing)
   define_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS BRIEF_DOCS " " FULL_DOCS " ")
   define_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY BRIEF_DOCS " " FULL_DOCS " ")
   define_property(GLOBAL PROPERTY EIGEN_TESTS_LIST BRIEF_DOCS " " FULL_DOCS " ")
+  define_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST BRIEF_DOCS " " FULL_DOCS " ")
 
   set_property(GLOBAL PROPERTY EIGEN_TESTED_BACKENDS "")
   set_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS "")
   set_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY "")
   set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "")
+  set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "")
 
   define_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT BRIEF_DOCS " " FULL_DOCS " ")
   define_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT BRIEF_DOCS " " FULL_DOCS " ")
@@ -508,7 +434,7 @@ macro(ei_init_testing)
 
   # uncomment anytime you change the ei_get_compilerver_from_cxx_version_string macro
   # ei_test_get_compilerver_from_cxx_version_string()
-endmacro(ei_init_testing)
+endmacro()
 
 macro(ei_set_sitename)
   # if the sitename is not yet set, try to set it
@@ -525,7 +451,7 @@ macro(ei_set_sitename)
   if(SITE)
     string(TOLOWER ${SITE} SITE)
   endif()
-endmacro(ei_set_sitename)
+endmacro()
 
 macro(ei_get_compilerver VAR)
     if(MSVC)
@@ -538,6 +464,8 @@ macro(ei_get_compilerver VAR)
       else()
         set(${VAR} "na")
       endif()
+    elseif(${CMAKE_CXX_COMPILER_ID} MATCHES "PGI")
+      set(${VAR} "${CMAKE_CXX_COMPILER_ID}-${CMAKE_CXX_COMPILER_VERSION}")
     else()
     # on all other system we rely on ${CMAKE_CXX_COMPILER}
     # supporting a "--version" or "/version" flag
@@ -550,18 +478,20 @@ macro(ei_get_compilerver VAR)
 
     execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION}
                     OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE)
+    string(REGEX REPLACE "^[ \n\r]+" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string})
     string(REGEX REPLACE "[\n\r].*"  ""  eigen_cxx_compiler_version_string  ${eigen_cxx_compiler_version_string})
 
     ei_get_compilerver_from_cxx_version_string("${eigen_cxx_compiler_version_string}" CNAME CVER)
     set(${VAR} "${CNAME}-${CVER}")
 
   endif()
-endmacro(ei_get_compilerver)
+endmacro()
 
 # Extract compiler name and version from a raw version string
-# WARNING: if you edit thid macro, then please test it by  uncommenting
+# WARNING: if you edit this macro, then please test it by uncommenting
 # the testing macro call in ei_init_testing() of the EigenTesting.cmake file.
-# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end of the file
+# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end
+# of the file
 macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
   # extract possible compiler names
   string(REGEX MATCH "g\\+\\+"      ei_has_gpp    ${VERSTRING})
@@ -569,6 +499,7 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
   string(REGEX MATCH "gcc|GCC"      ei_has_gcc    ${VERSTRING})
   string(REGEX MATCH "icpc|ICC"     ei_has_icpc   ${VERSTRING})
   string(REGEX MATCH "clang|CLANG"  ei_has_clang  ${VERSTRING})
+  string(REGEX MATCH "mingw32"      ei_has_mingw  ${VERSTRING})
 
   # combine them
   if((ei_has_llvm) AND (ei_has_gpp OR ei_has_gcc))
@@ -577,6 +508,8 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
     set(${CNAME} "llvm-clang++")
   elseif(ei_has_clang)
     set(${CNAME} "clang++")
+  elseif ((ei_has_mingw) AND (ei_has_gpp OR ei_has_gcc))
+    set(${CNAME} "mingw32-g++")
   elseif(ei_has_icpc)
     set(${CNAME} "icpc")
   elseif(ei_has_gpp OR ei_has_gcc)
@@ -597,15 +530,21 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER)
       if(NOT eicver)
         # try to extract 2:
         string(REGEX MATCH "[^0-9][0-9]+\\.[0-9]+" eicver ${VERSTRING})
-      else()
-        set(eicver " _")
+        if (NOT eicver AND ei_has_mingw)
+          # try to extract 1 number plus suffix:
+          string(REGEX MATCH "[^0-9][0-9]+-win32" eicver ${VERSTRING})          
+        endif()
       endif()
     endif()
   endif()
+  
+  if (NOT eicver)
+    set(eicver " _")
+  endif()
 
   string(REGEX REPLACE ".(.*)" "\\1" ${CVER} ${eicver})
 
-endmacro(ei_get_compilerver_from_cxx_version_string)
+endmacro()
 
 macro(ei_get_cxxflags VAR)
   set(${VAR} "")
@@ -634,6 +573,8 @@ macro(ei_get_cxxflags VAR)
     set(${VAR} SSE3)
   elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV)
     set(${VAR} SSE2)
+  elseif(EIGEN_TEST_MSA)
+    set(${VAR} MSA)
   endif()
 
   if(EIGEN_TEST_OPENMP)
@@ -651,7 +592,7 @@ macro(ei_get_cxxflags VAR)
       set(${VAR} ${${VAR}}-ROWMAJ)
     endif()
   endif()
-endmacro(ei_get_cxxflags)
+endmacro()
 
 macro(ei_set_build_string)
   ei_get_compilerver(LOCAL_COMPILER_VERSION)
@@ -666,6 +607,10 @@ macro(ei_set_build_string)
     set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${LOCAL_COMPILER_FLAGS})
   endif()
 
+  if(EIGEN_TEST_EXTERNAL_BLAS)
+    set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-external_blas)
+  endif()
+
   ei_is_64bit_env(IS_64BIT_ENV)
   if(NOT IS_64BIT_ENV)
     set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-32bit)
@@ -677,14 +622,12 @@ macro(ei_set_build_string)
     set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11)
   endif()
 
-  set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-v3.3)
-
   if(EIGEN_BUILD_STRING_SUFFIX)
     set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX})
   endif()
 
   string(TOLOWER ${TMP_BUILD_STRING} BUILDNAME)
-endmacro(ei_set_build_string)
+endmacro()
 
 macro(ei_is_64bit_env VAR)
   if(CMAKE_SIZEOF_VOID_P EQUAL 8)
@@ -694,7 +637,7 @@ macro(ei_is_64bit_env VAR)
   else()
     message(WARNING "Unsupported pointer size. Please contact the authors.")
   endif()
-endmacro(ei_is_64bit_env)
+endmacro()
 
 
 # helper macro for testing ei_get_compilerver_from_cxx_version_string
@@ -707,7 +650,7 @@ macro(ei_test1_get_compilerver_from_cxx_version_string STR REFNAME REFVER)
     message("STATUS ei_get_compilerver_from_cxx_version_string error:")
     message("Expected \"${REFNAME}-${REFVER}\", got \"${CNAME}-${CVER}\"")
   endif()
-endmacro(ei_test1_get_compilerver_from_cxx_version_string)
+endmacro()
 
 # macro for testing ei_get_compilerver_from_cxx_version_string
 # feel free to add more version strings
@@ -722,4 +665,118 @@ macro(ei_test_get_compilerver_from_cxx_version_string)
   ei_test1_get_compilerver_from_cxx_version_string("i686-apple-darwin11-llvm-g++-4.2 (GCC) 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2335.15.00)" "llvm-g++" "4.2.1")
   ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 4.4.6" "g++" "4.4.6")
   ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 2011" "g++" "4.4")
-endmacro(ei_test_get_compilerver_from_cxx_version_string)
+  ei_test1_get_compilerver_from_cxx_version_string("x86_64-w64-mingw32-g++ (GCC) 10-win32 20210110" "mingw32-g++" "10-win32")
+endmacro()
+
+# Split all tests listed in EIGEN_TESTS_LIST into num_splits many targets
+# named buildtestspartN with N = { 0, ..., num_splits-1}.
+#
+# The intention behind the existance of this macro is the size of Eigen's
+# testsuite. Together with the relativly big compile-times building all tests
+# can take a substantial amount of time depending on the available hardware.
+# 
+# The last buildtestspartN target will build possible remaining tests.
+#
+# An example:
+#
+#   EIGEN_TESTS_LIST= [ test1, test2, test3, test4, test5, test6, test7 ]
+#
+# A call to ei_split_testsuite(3) creates the following targets with dependencies
+#
+#   Target                      Dependencies
+#   ------                      ------------
+#   buildtestspart0             test1, test2
+#   buildtestspart1             test3, test4
+#   buildtestspart2             test5, test6, test7
+#
+macro(ei_split_testsuite num_splits)
+  get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
+
+  # Translate EIGEN_TESTS_LIST into a CMake list
+  string(REGEX REPLACE "\n" " " EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  separate_arguments(EIGEN_TESTS_LIST)
+
+  set(eigen_test_count "0")
+  foreach(t IN ITEMS ${EIGEN_TESTS_LIST})
+    math(EXPR eigen_test_count "${eigen_test_count}+1")
+  endforeach()
+
+  # Get number of tests per target
+  math(EXPR num_tests_per_target "${eigen_test_count}/${num_splits} - ${eigen_test_count}/${num_splits} % 1")
+
+  set(test_idx "0")
+  math(EXPR target_bound "${num_splits}-1")
+  foreach(part RANGE "0" "${target_bound}")
+    # Create target
+    set(current_target "buildtestspart${part}")
+    add_custom_target("${current_target}")
+    math(EXPR upper_bound "${test_idx} + ${num_tests_per_target} - 1")
+    foreach(test_idx RANGE "${test_idx}" "${upper_bound}")
+      list(GET EIGEN_TESTS_LIST "${test_idx}" curr_test)
+      add_dependencies("${current_target}" "${curr_test}")
+    endforeach()
+    math(EXPR test_idx "${test_idx} + ${num_tests_per_target}")
+  endforeach()
+  
+  # Handle the possibly remaining tests
+  math(EXPR test_idx "${num_splits} * ${num_tests_per_target}")
+  math(EXPR target_bound "${eigen_test_count} - 1")
+  foreach(test_idx RANGE "${test_idx}" "${target_bound}")
+    list(GET EIGEN_TESTS_LIST "${test_idx}" curr_test)
+    add_dependencies("${current_target}" "${curr_test}")
+  endforeach()
+endmacro(ei_split_testsuite num_splits)
+
+# Defines the custom command buildsmoketests to build a number of tests
+# specified in smoke_test_list.
+# 
+# Test in smoke_test_list can be either test targets (e.g. packetmath) or
+# subtests targets (e.g. packetmath_2). If any of the test are not available
+# in the current configuration they are just skipped. 
+#
+# All tests added via this macro are labeled with the smoketest label. This
+# allows running smoketests only using ctest.
+#
+# Smoke tests are intended to be run before the whole test suite is invoked,
+# e.g., to smoke test patches.
+macro(ei_add_smoke_tests smoke_test_list)
+  # Set the build target to build smoketests
+  set(buildtarget "buildsmoketests")
+  add_custom_target("${buildtarget}")
+
+  # Get list of all tests and translate it into a CMake list
+  get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST)
+  string(REGEX REPLACE "\n" " " EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}")
+  separate_arguments(EIGEN_TESTS_LIST)
+
+  # Check if the test in smoke_test_list is a currently valid test target
+  foreach(test IN ITEMS ${smoke_test_list})
+    # Add tests in smoke_test_list to our smoke test target but only if the test
+    # is currently available, i.e., is in EIGEN_SUBTESTS_LIST
+    if ("${test}" IN_LIST EIGEN_TESTS_LIST)
+      add_dependencies("${buildtarget}" "${test}")
+      # In the case of a test we match all subtests
+      set(ctest_regex "${ctest_regex}^${test}_[0-9]+$$|")
+    endif()
+  endforeach()
+
+  # Get list of all subtests and translate it into a CMake list
+  get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST)
+  string(REGEX REPLACE "\n" " " EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+  set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}")
+  separate_arguments(EIGEN_SUBTESTS_LIST)
+
+  # Check if the test in smoke_test_list is a currently valid subtest target
+  foreach(test IN ITEMS ${smoke_test_list})
+    # Add tests in smoke_test_list to our smoke test target but only if the test
+    # is currently available, i.e., is in EIGEN_SUBTESTS_LIST
+    if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST)
+      add_dependencies("${buildtarget}" "${test}")
+      # Add label smoketest to be able to run smoketests using ctest
+      get_property(test_labels TEST ${test} PROPERTY LABELS)
+      set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest")
+    endif()
+  endforeach()
+endmacro(ei_add_smoke_tests)
diff --git a/contrib/eigen/cmake/EigenUninstall.cmake b/contrib/eigen/cmake/EigenUninstall.cmake
index 4dae8c85c8701e8cf676acc88c65065577d7e353..5e63c98d9974cdf7a5679690be9dbf7d78a02b30 100644
--- a/contrib/eigen/cmake/EigenUninstall.cmake
+++ b/contrib/eigen/cmake/EigenUninstall.cmake
@@ -27,7 +27,7 @@ if(EXISTS ${MANIFEST})
     else()
       message(STATUS "File '${file}' does not exist.")
     endif()
-  endforeach(file)
+  endforeach()
 
   message(STATUS "========== Finished Uninstalling Eigen  ==============")
 else()
diff --git a/contrib/eigen/cmake/FindAdolc.cmake b/contrib/eigen/cmake/FindAdolc.cmake
index 937e549904e260bb7b0b4e6ba1ee82369938e7fd..13c59fcf72b239670020a9f9a129fcd4637d91af 100644
--- a/contrib/eigen/cmake/FindAdolc.cmake
+++ b/contrib/eigen/cmake/FindAdolc.cmake
@@ -1,20 +1,20 @@
 
 if (ADOLC_INCLUDES AND ADOLC_LIBRARIES)
   set(ADOLC_FIND_QUIETLY TRUE)
-endif (ADOLC_INCLUDES AND ADOLC_LIBRARIES)
+endif ()
 
 find_path(ADOLC_INCLUDES
-  NAMES
-  adolc/adtl.h
-  PATHS
-  $ENV{ADOLCDIR}
-  ${INCLUDE_INSTALL_DIR}
+  NAMES adolc/adtl.h
+  PATHS $ENV{ADOLCDIR} $ENV{ADOLCDIR}/include ${INCLUDE_INSTALL_DIR}
 )
 
-find_library(ADOLC_LIBRARIES adolc PATHS $ENV{ADOLCDIR} ${LIB_INSTALL_DIR})
+find_library(ADOLC_LIBRARIES 
+  adolc 
+  PATHS $ENV{ADOLCDIR} ${LIB_INSTALL_DIR} 
+  PATH_SUFFIXES lib lib64)
 
 include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(ADOLC DEFAULT_MSG
+find_package_handle_standard_args(Adolc DEFAULT_MSG
                                   ADOLC_INCLUDES ADOLC_LIBRARIES)
 
 mark_as_advanced(ADOLC_INCLUDES ADOLC_LIBRARIES)
diff --git a/contrib/eigen/cmake/FindBLAS.cmake b/contrib/eigen/cmake/FindBLAS.cmake
index e3395bc10ec08c14869fa7d28bb8ebfdff4b30a9..1bb8f19652c9582dca81662e1bad7fb23a7aea1b 100644
--- a/contrib/eigen/cmake/FindBLAS.cmake
+++ b/contrib/eigen/cmake/FindBLAS.cmake
@@ -147,6 +147,7 @@ mark_as_advanced(BLAS_VERBOSE)
 
 include(CheckFunctionExists)
 include(CheckFortranFunctionExists)
+include(CMakeFindDependencyMacro)
 
 set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
 
@@ -270,8 +271,8 @@ macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread)
       endif ()
       set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
       set(_libraries_work ${${_prefix}_${_library}_LIBRARY})
-    endif(_libraries_work)
-  endforeach(_library ${_list})
+    endif()
+  endforeach()
 
   if(_libraries_work)
     # Test this combination of libraries.
@@ -310,11 +311,11 @@ macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread)
 
   if(_libraries_work)
     set(${LIBRARIES} ${${LIBRARIES}} ${_thread})
-  else(_libraries_work)
+  else()
     set(${LIBRARIES} FALSE)
-  endif(_libraries_work)
+  endif()
 
-endmacro(Check_Fortran_Libraries)
+endmacro()
 
 
 set(BLAS_LINKER_FLAGS)
@@ -438,7 +439,7 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
       if (OMP_gomp_LIBRARY)
 	set(OMP_LIB "${OMP_gomp_LIBRARY}")
       endif()
-    else(CMAKE_C_COMPILER_ID STREQUAL "Intel")
+    else()
       if (OMP_iomp5_LIBRARY)
 	set(OMP_LIB "${OMP_iomp5_LIBRARY}")
       endif()
@@ -509,9 +510,9 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
 
     if (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX)
       if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED)
-	find_package(Threads)
+	find_dependency(Threads)
       else()
-	find_package(Threads REQUIRED)
+	find_dependency(Threads REQUIRED)
       endif()
 
       set(BLAS_SEARCH_LIBS "")
@@ -560,7 +561,7 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
 		"${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}")
 	    endforeach()
 	  endforeach()
-	else (WIN32)
+	else ()
 	  if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All")
 	    list(APPEND BLAS_SEARCH_LIBS
 	      "mkl_blas95 mkl_intel mkl_intel_thread mkl_core guide")
@@ -586,9 +587,9 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
 	      set(OMP_LIB "")
 	    endif()
 	  endif ()
-	endif (WIN32)
+	endif ()
 
-      else (BLA_F95)
+      else ()
 
 	set(BLAS_mkl_SEARCH_SYMBOL sgemm)
 	set(_LIBRARIES BLAS_LIBRARIES)
@@ -632,7 +633,7 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
 		"${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}")
 	    endforeach()
 	  endforeach()
-	else (WIN32)
+	else ()
 	  if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All")
 	    list(APPEND BLAS_SEARCH_LIBS
 	      "mkl_intel mkl_intel_thread mkl_core guide")
@@ -667,9 +668,9 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
 	    list(APPEND BLAS_SEARCH_LIBS
 	      "mkl_em64t")
 	  endif ()
-	endif (WIN32)
+	endif ()
 
-      endif (BLA_F95)
+      endif ()
 
       foreach (IT ${BLAS_SEARCH_LIBS})
 	string(REPLACE " " ";" SEARCH_LIBS ${IT})
@@ -698,9 +699,9 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
       if (${_LIBRARIES} AND NOT BLAS_VENDOR_FOUND)
           set (BLAS_VENDOR_FOUND "Intel MKL")
       endif()
-    endif (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX)
-  endif(NOT BLAS_LIBRARIES OR BLA_VENDOR MATCHES "Intel*")
-endif (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All")
+    endif ()
+  endif()
+endif ()
 
 
 if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All")
@@ -727,7 +728,7 @@ if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "Goto")
   endif()
 
-endif (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 # OpenBlas
@@ -755,7 +756,7 @@ if (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "Openblas")
   endif()
 
-endif (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 # EigenBlas
@@ -802,7 +803,7 @@ if (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "Eigen")
   endif()
 
-endif (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All")
@@ -830,7 +831,7 @@ if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "Atlas")
   endif()
 
-endif (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 # BLAS in PhiPACK libraries? (requires generic BLAS lib, too)
@@ -858,7 +859,7 @@ if (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "PhiPACK")
   endif()
 
-endif (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 # BLAS in Alpha CXML library?
@@ -886,7 +887,7 @@ if (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "CXML")
   endif()
 
-endif (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 # BLAS in Alpha DXML library? (now called CXML, see above)
@@ -914,7 +915,7 @@ if (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "DXML")
   endif()
   
-endif (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 # BLAS in Sun Performance library?
@@ -1124,7 +1125,7 @@ if (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All")
 	  "${_ACML_ROOT}/${_ACML_COMPILER64}${_ACML_PATH_SUFFIX}/lib" )
       endif()
 
-    endif(_ACML_ROOT)
+    endif()
 
   elseif(BLAS_${BLA_VENDOR}_LIB_DIRS)
 
@@ -1229,7 +1230,7 @@ if (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "ACML")
   endif()
 
-endif (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All") # ACML
+endif () # ACML
 
 
 # Apple BLAS library?
@@ -1257,7 +1258,7 @@ if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "Apple Accelerate")
   endif()
 
-endif (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All")
@@ -1284,7 +1285,7 @@ if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "NAS")
   endif()
 
-endif (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 # Generic BLAS library?
@@ -1316,7 +1317,7 @@ if (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All")
       set (BLAS_VENDOR_FOUND "Netlib or other Generic libblas")
   endif()
 
-endif (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All")
+endif ()
 
 
 if(BLA_F95)
@@ -1331,7 +1332,7 @@ if(BLA_F95)
     if(BLAS95_FOUND)
       message(STATUS "A library with BLAS95 API found.")
       message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}")
-    else(BLAS95_FOUND)
+    else()
       message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas 95 libraries could not be found or check of symbols failed."
 	"\nPlease indicate where to find blas libraries. You have three options:\n"
 	"- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n"
@@ -1349,13 +1350,13 @@ if(BLA_F95)
 	message(STATUS
 	  "A library with BLAS95 API not found. Please specify library location.")
       endif()
-    endif(BLAS95_FOUND)
-  endif(NOT BLAS_FIND_QUIETLY)
+    endif()
+  endif()
 
   set(BLAS_FOUND TRUE)
   set(BLAS_LIBRARIES "${BLAS95_LIBRARIES}")
 
-else(BLA_F95)
+else()
 
   if(BLAS_LIBRARIES)
     set(BLAS_FOUND TRUE)
@@ -1367,7 +1368,7 @@ else(BLA_F95)
     if(BLAS_FOUND)
       message(STATUS "A library with BLAS API found.")
       message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}")
-    else(BLAS_FOUND)
+    else()
       message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas libraries could not be found or check of symbols failed."
 	"\nPlease indicate where to find blas libraries. You have three options:\n"
 	"- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n"
@@ -1385,10 +1386,10 @@ else(BLA_F95)
 	message(STATUS
 	  "A library with BLAS API not found. Please specify library location.")
       endif()
-    endif(BLAS_FOUND)
-  endif(NOT BLAS_FIND_QUIETLY)
+    endif()
+  endif()
 
-endif(BLA_F95)
+endif()
 
 set(CMAKE_FIND_LIBRARY_SUFFIXES ${_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
 
diff --git a/contrib/eigen/cmake/FindBLASEXT.cmake b/contrib/eigen/cmake/FindBLASEXT.cmake
index 0fe7fb84931bacf4f7879be412c6abf37c44d175..69a941897ef23c7c77d72fa41fc969e3c320b0df 100644
--- a/contrib/eigen/cmake/FindBLASEXT.cmake
+++ b/contrib/eigen/cmake/FindBLASEXT.cmake
@@ -41,18 +41,19 @@
 #  License text for the above reference.)
 
 # macro to factorize this call
+include(CMakeFindDependencyMacro)
 macro(find_package_blas)
   if(BLASEXT_FIND_REQUIRED)
     if(BLASEXT_FIND_QUIETLY)
-      find_package(BLAS REQUIRED QUIET)
+      find_dependency(BLAS REQUIRED QUIET)
     else()
-      find_package(BLAS REQUIRED)
+      find_dependency(BLAS REQUIRED)
     endif()
   else()
     if(BLASEXT_FIND_QUIETLY)
-      find_package(BLAS QUIET)
+      find_dependency(BLAS QUIET)
     else()
-      find_package(BLAS)
+      find_dependency(BLAS)
     endif()
   endif()
 endmacro()
@@ -316,7 +317,7 @@ if(BLA_VENDOR MATCHES "Intel*")
 	"\n   (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)")
       message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
     endif()
-    find_package_handle_standard_args(BLAS DEFAULT_MSG
+    find_package_handle_standard_args(BLASEXT DEFAULT_MSG
       BLAS_SEQ_LIBRARIES
       BLAS_LIBRARY_DIRS
       BLAS_INCLUDE_DIRS)
@@ -324,14 +325,14 @@ if(BLA_VENDOR MATCHES "Intel*")
       if(NOT BLASEXT_FIND_QUIETLY)
 	message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES")
       endif()
-      find_package_handle_standard_args(BLAS DEFAULT_MSG
+      find_package_handle_standard_args(BLASEXT DEFAULT_MSG
 	BLAS_PAR_LIBRARIES)
     endif()
   else()
     if(NOT BLASEXT_FIND_QUIETLY)
       message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
     endif()
-    find_package_handle_standard_args(BLAS DEFAULT_MSG
+    find_package_handle_standard_args(BLASEXT DEFAULT_MSG
       BLAS_SEQ_LIBRARIES
       BLAS_LIBRARY_DIRS
       BLAS_INCLUDE_DIRS)
@@ -343,14 +344,14 @@ elseif(BLA_VENDOR MATCHES "ACML*")
       "\n   (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)")
     message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
   endif()
-  find_package_handle_standard_args(BLAS DEFAULT_MSG
+  find_package_handle_standard_args(BLASEXT DEFAULT_MSG
     BLAS_SEQ_LIBRARIES
     BLAS_LIBRARY_DIRS)
   if(BLAS_PAR_LIBRARIES)
     if(NOT BLASEXT_FIND_QUIETLY)
       message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES")
     endif()
-    find_package_handle_standard_args(BLAS DEFAULT_MSG
+    find_package_handle_standard_args(BLASEXT DEFAULT_MSG
       BLAS_PAR_LIBRARIES)
   endif()
 elseif(BLA_VENDOR MATCHES "IBMESSL*")
@@ -360,21 +361,24 @@ elseif(BLA_VENDOR MATCHES "IBMESSL*")
       "\n   (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)")
     message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
   endif()
-  find_package_handle_standard_args(BLAS DEFAULT_MSG
+  find_package_handle_standard_args(BLASEXT DEFAULT_MSG
     BLAS_SEQ_LIBRARIES
     BLAS_LIBRARY_DIRS)
   if(BLAS_PAR_LIBRARIES)
     if(NOT BLASEXT_FIND_QUIETLY)
       message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES")
     endif()
-    find_package_handle_standard_args(BLAS DEFAULT_MSG
+    find_package_handle_standard_args(BLASEXT DEFAULT_MSG
       BLAS_PAR_LIBRARIES)
   endif()
 else()
   if(NOT BLASEXT_FIND_QUIETLY)
     message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES")
   endif()
-  find_package_handle_standard_args(BLAS DEFAULT_MSG
+  find_package_handle_standard_args(BLASEXT DEFAULT_MSG
     BLAS_SEQ_LIBRARIES
     BLAS_LIBRARY_DIRS)
 endif()
+
+# Callers expect BLAS_FOUND to be set as well.
+set(BLAS_FOUND BLASEXT_FOUND)
diff --git a/contrib/eigen/cmake/FindCholmod.cmake b/contrib/eigen/cmake/FindCholmod.cmake
index 23239c300f1b2004794e9968a825a45fa5bb5ccd..e470cb2e0e502b432625d7813e07c1c42da2f3b1 100644
--- a/contrib/eigen/cmake/FindCholmod.cmake
+++ b/contrib/eigen/cmake/FindCholmod.cmake
@@ -1,9 +1,9 @@
-# Cholmod lib usually requires linking to a blas and lapack library.
+# CHOLMOD lib usually requires linking to a blas and lapack library.
 # It is up to the user of this module to find a BLAS and link to it.
 
 if (CHOLMOD_INCLUDES AND CHOLMOD_LIBRARIES)
   set(CHOLMOD_FIND_QUIETLY TRUE)
-endif (CHOLMOD_INCLUDES AND CHOLMOD_LIBRARIES)
+endif ()
 
 find_path(CHOLMOD_INCLUDES
   NAMES
@@ -29,7 +29,7 @@ if(CHOLMOD_LIBRARIES)
     set(CHOLMOD_LIBRARIES FALSE)
   endif ()
 
-endif(CHOLMOD_LIBRARIES)
+endif()
 
 if(CHOLMOD_LIBRARIES)
 
@@ -40,7 +40,7 @@ if(CHOLMOD_LIBRARIES)
     set(CHOLMOD_LIBRARIES FALSE)
   endif ()
 
-endif(CHOLMOD_LIBRARIES)
+endif()
 
 if(CHOLMOD_LIBRARIES)
 
@@ -51,7 +51,7 @@ if(CHOLMOD_LIBRARIES)
     set(CHOLMOD_LIBRARIES FALSE)
   endif ()
 
-endif(CHOLMOD_LIBRARIES)
+endif()
 
 if(CHOLMOD_LIBRARIES)
 
@@ -62,7 +62,7 @@ if(CHOLMOD_LIBRARIES)
     set(CHOLMOD_LIBRARIES FALSE)
   endif ()
 
-endif(CHOLMOD_LIBRARIES)
+endif()
 
 if(CHOLMOD_LIBRARIES)
 
@@ -71,16 +71,16 @@ if(CHOLMOD_LIBRARIES)
     set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${CHOLMOD_METIS_LIBRARY})
   endif ()
 
-endif(CHOLMOD_LIBRARIES)
+endif()
 
 if(CHOLMOD_LIBRARIES)
 
   find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR})
   if (SUITESPARSE_LIBRARY)
     set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${SUITESPARSE_LIBRARY})
-  endif (SUITESPARSE_LIBRARY)
+  endif ()
   
-endif(CHOLMOD_LIBRARIES)
+endif()
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(CHOLMOD DEFAULT_MSG
diff --git a/contrib/eigen/cmake/FindComputeCpp.cmake b/contrib/eigen/cmake/FindComputeCpp.cmake
index 07ebed61b9b3eabf8ea5c92ebb2c276bb391c0c8..1c271f0fecba7fcd510176040525088330c341f9 100644
--- a/contrib/eigen/cmake/FindComputeCpp.cmake
+++ b/contrib/eigen/cmake/FindComputeCpp.cmake
@@ -2,7 +2,7 @@
 # FindComputeCpp
 #---------------
 #
-#   Copyright 2016 Codeplay Software Ltd.
+#   Copyright 2016-2018 Codeplay Software Ltd.
 #
 #   Licensed under the Apache License, Version 2.0 (the "License");
 #   you may not use these files except in compliance with the License.
@@ -23,204 +23,359 @@
 #
 #  Tools for finding and building with ComputeCpp.
 #
-#  User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp
-#   installation.
+#  User must define ComputeCpp_DIR pointing to the ComputeCpp
+#  installation.
 #
 #  Latest version of this file can be found at:
 #    https://github.com/codeplaysoftware/computecpp-sdk
 
-# Require CMake version 3.2.2 or higher
-cmake_minimum_required(VERSION 3.2.2)
-
-# Check that a supported host compiler can be found
-if(CMAKE_COMPILER_IS_GNUCXX)
-    # Require at least gcc 4.8
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
-      message(FATAL_ERROR
-        "host compiler - Not found! (gcc version must be at least 4.8)")
-    # Require the GCC dual ABI to be disabled for 5.1 or higher
-    elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1)
-      set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
-      message(STATUS
-        "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)")
-    else()
-      message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
-    endif()
-elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    # Require at least clang 3.6
-    if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
-      message(FATAL_ERROR
-        "host compiler - Not found! (clang version must be at least 3.6)")
-    else()
-      message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
-    endif()
-else()
-  message(WARNING
-    "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
-endif()
+cmake_minimum_required(VERSION 3.4.3)
+include(FindPackageHandleStandardArgs)
+include(ComputeCppIRMap)
 
-set(COMPUTECPP_64_BIT_DEFAULT ON)
-option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode"
-        ${COMPUTECPP_64_BIT_DEFAULT})
-mark_as_advanced(COMPUTECPP_64_BIT_CODE)
+set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++")
+separate_arguments(COMPUTECPP_USER_FLAGS)
+mark_as_advanced(COMPUTECPP_USER_FLAGS)
 
-# Find OpenCL package
-find_package(OpenCL REQUIRED)
+set(COMPUTECPP_BITCODE "spir64" CACHE STRING
+  "Bitcode type to use as SYCL target in compute++")
+mark_as_advanced(COMPUTECPP_BITCODE)
 
-# Find ComputeCpp packagee
-if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
-  message(FATAL_ERROR
-    "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR")
-else()
-  message(STATUS "ComputeCpp package - Found")
-endif()
-option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package")
-
-# Obtain the path to compute++
-find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS
-  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
-if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
-  mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
-  message(STATUS "compute++ - Found")
-else()
-  message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
-endif()
+include(CMakeFindDependencyMacro)
+find_dependency(OpenCL REQUIRED)
 
-# Obtain the path to computecpp_info
-find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
-  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
-if (EXISTS ${COMPUTECPP_INFO_TOOL})
-  mark_as_advanced(${COMPUTECPP_INFO_TOOL})
-  message(STATUS "computecpp_info - Found")
-else()
-  message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
+# Find ComputeCpp package
+
+if(DEFINED ComputeCpp_DIR)
+  set(computecpp_find_hint ${ComputeCpp_DIR})
+elseif(DEFINED ENV{COMPUTECPP_DIR})
+  set(computecpp_find_hint $ENV{COMPUTECPP_DIR})
 endif()
 
-# Obtain the path to the ComputeCpp runtime library
-find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
-  HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib
-  DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH)
+# Used for running executables on the host
+set(computecpp_host_find_hint ${computecpp_find_hint})
 
-if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
-  mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
-  message(STATUS "libComputeCpp.so - Found")
-else()
-  message(FATAL_ERROR "libComputeCpp.so - Not found!")
+if(CMAKE_CROSSCOMPILING)
+  # ComputeCpp_HOST_DIR is used to find executables that are run on the host
+  if(DEFINED ComputeCpp_HOST_DIR)
+    set(computecpp_host_find_hint ${ComputeCpp_HOST_DIR})
+  elseif(DEFINED ENV{COMPUTECPP_HOST_DIR})
+    set(computecpp_host_find_hint $ENV{COMPUTECPP_HOST_DIR})
+  endif()
 endif()
 
-# Obtain the ComputeCpp include directory
-set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
-if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
-  message(FATAL_ERROR "ComputeCpp includes - Not found!")
-else()
-  message(STATUS "ComputeCpp includes - Found")
-endif()
+find_program(ComputeCpp_DEVICE_COMPILER_EXECUTABLE compute++
+  HINTS ${computecpp_host_find_hint}
+  PATH_SUFFIXES bin
+  NO_SYSTEM_ENVIRONMENT_PATH)
+
+find_program(ComputeCpp_INFO_EXECUTABLE computecpp_info
+  HINTS ${computecpp_host_find_hint}
+  PATH_SUFFIXES bin
+  NO_SYSTEM_ENVIRONMENT_PATH)
 
-# Obtain the package version
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
-  OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "Package version - Error obtaining version!")
+find_library(COMPUTECPP_RUNTIME_LIBRARY
+  NAMES ComputeCpp ComputeCpp_vs2015
+  HINTS ${computecpp_find_hint}
+  PATH_SUFFIXES lib
+  DOC "ComputeCpp Runtime Library")
+
+find_library(COMPUTECPP_RUNTIME_LIBRARY_DEBUG
+  NAMES ComputeCpp_d ComputeCpp ComputeCpp_vs2015_d
+  HINTS ${computecpp_find_hint}
+  PATH_SUFFIXES lib
+  DOC "ComputeCpp Debug Runtime Library")
+
+find_path(ComputeCpp_INCLUDE_DIRS
+  NAMES "CL/sycl.hpp"
+  HINTS ${computecpp_find_hint}/include
+  DOC "The ComputeCpp include directory")
+get_filename_component(ComputeCpp_INCLUDE_DIRS ${ComputeCpp_INCLUDE_DIRS} ABSOLUTE)
+
+get_filename_component(computecpp_canonical_root_dir "${ComputeCpp_INCLUDE_DIRS}/.." ABSOLUTE)
+set(ComputeCpp_ROOT_DIR "${computecpp_canonical_root_dir}" CACHE PATH
+    "The root of the ComputeCpp install")
+
+if(NOT ComputeCpp_INFO_EXECUTABLE)
+  message(WARNING "Can't find computecpp_info - check ComputeCpp_DIR")
 else()
-  mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
-  message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
+  execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-version"
+    OUTPUT_VARIABLE ComputeCpp_VERSION
+    RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0")
+    message(WARNING "Package version - Error obtaining version!")
+  endif()
+
+  execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-is-supported"
+    OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
+    RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0")
+    message(WARNING "platform - Error checking platform support!")
+  else()
+    mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
+    if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
+      message(STATUS "platform - your system can support ComputeCpp")
+    else()
+      message(STATUS "platform - your system is not officially supported")
+    endif()
+  endif()
 endif()
 
-# Obtain the device compiler flags
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
-  OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
-else()
-  mark_as_advanced(COMPUTECPP_COMPILER_FLAGS)
-  message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
+find_package_handle_standard_args(ComputeCpp
+  REQUIRED_VARS ComputeCpp_ROOT_DIR
+                ComputeCpp_DEVICE_COMPILER_EXECUTABLE
+                ComputeCpp_INFO_EXECUTABLE
+                COMPUTECPP_RUNTIME_LIBRARY
+                COMPUTECPP_RUNTIME_LIBRARY_DEBUG
+                ComputeCpp_INCLUDE_DIRS
+  VERSION_VAR ComputeCpp_VERSION)
+mark_as_advanced(ComputeCpp_ROOT_DIR
+                 ComputeCpp_DEVICE_COMPILER_EXECUTABLE
+                 ComputeCpp_INFO_EXECUTABLE
+                 COMPUTECPP_RUNTIME_LIBRARY
+                 COMPUTECPP_RUNTIME_LIBRARY_DEBUG
+                 ComputeCpp_INCLUDE_DIRS
+                 ComputeCpp_VERSION)
+
+if(NOT ComputeCpp_FOUND)
+  return()
 endif()
 
-set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1)
+list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -O2 -mllvm -inline-threshold=1000 -intelspirmetadata)
+mark_as_advanced(COMPUTECPP_DEVICE_COMPILER_FLAGS)
 
-# Check if the platform is supported
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
-  OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "platform - Error checking platform support!")
-else()
-  mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
-  if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
-    message(STATUS "platform - your system can support ComputeCpp")
-  else()
-    message(STATUS "platform - your system CANNOT support ComputeCpp")
+if(CMAKE_CROSSCOMPILING)
+  if(NOT COMPUTECPP_DONT_USE_TOOLCHAIN)
+    list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --gcc-toolchain=${COMPUTECPP_TOOLCHAIN_DIR})
   endif()
+  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --sysroot=${COMPUTECPP_SYSROOT_DIR})
+  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -target ${COMPUTECPP_TARGET_TRIPLE})
+endif()
+
+list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -sycl-target ${COMPUTECPP_BITCODE})
+message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
+
+include(ComputeCppCompilerChecks)
+
+if(NOT TARGET OpenCL::OpenCL)
+  add_library(OpenCL::OpenCL UNKNOWN IMPORTED)
+  set_target_properties(OpenCL::OpenCL PROPERTIES
+    IMPORTED_LOCATION             "${OpenCL_LIBRARIES}"
+    INTERFACE_INCLUDE_DIRECTORIES "${OpenCL_INCLUDE_DIRS}"
+  )
+endif()
+
+if(NOT TARGET ComputeCpp::ComputeCpp)
+  add_library(ComputeCpp::ComputeCpp UNKNOWN IMPORTED)
+  set_target_properties(ComputeCpp::ComputeCpp PROPERTIES
+    IMPORTED_LOCATION_DEBUG          "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}"
+    IMPORTED_LOCATION_RELWITHDEBINFO "${COMPUTECPP_RUNTIME_LIBRARY}"
+    IMPORTED_LOCATION                "${COMPUTECPP_RUNTIME_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES    "${ComputeCpp_INCLUDE_DIRS}"
+    INTERFACE_LINK_LIBRARIES         "OpenCL::OpenCL"
+  )
 endif()
 
+# This property allows targets to specify that their sources should be
+# compiled with the integration header included after the user's
+# sources, not before (e.g. when an enum is used in a kernel name, this
+# is not technically valid SYCL code but can work with ComputeCpp)
+define_property(
+  TARGET PROPERTY COMPUTECPP_INCLUDE_AFTER
+  BRIEF_DOCS "Include integration header after user source"
+  FULL_DOCS "Changes compiler arguments such that the source file is
+  actually the integration header, and the .cpp file is included on
+  the command line so that it is seen by the compiler first. Enables
+  non-standards-conformant SYCL code to compile with ComputeCpp."
+)
+define_property(
+  TARGET PROPERTY INTERFACE_COMPUTECPP_FLAGS
+  BRIEF_DOCS "Interface compile flags to provide compute++"
+  FULL_DOCS  "Set additional compile flags to pass to compute++ when compiling
+  any target which links to this one."
+)
+define_property(
+  SOURCE PROPERTY COMPUTECPP_SOURCE_FLAGS
+  BRIEF_DOCS "Source file compile flags for compute++"
+  FULL_DOCS  "Set additional compile flags for compiling the SYCL integration
+  header for the given source file."
+)
+
 ####################
-#   __build_sycl
+#   __build_ir
 ####################
 #
 #  Adds a custom target for running compute++ and adding a dependency for the
-#  resulting integration header.
+#  resulting integration header and kernel binary.
 #
-#  targetName : Name of the target.
-#  sourceFile : Source file to be compiled.
-#  binaryDir : Intermediate directory to output the integration header.
+#  TARGET : Name of the target.
+#  SOURCE : Source file to be compiled.
+#  COUNTER : Counter included in name of custom target. Different counter
+#       values prevent duplicated names of custom target when source files with
+#       the same name, but located in different directories, are used for the
+#       same target.
 #
-function(__build_spir targetName sourceFile binaryDir)
-
-  # Retrieve source file name.
-  get_filename_component(sourceFileName ${sourceFile} NAME)
-
-  # Set the path to the Sycl file.
-  set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl)
-
-  # Add any user-defined include to the device compiler
-  get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY
-    INCLUDE_DIRECTORIES)
-  set(device_compiler_includes "")
-  foreach(directory ${includeDirectories})
-    set(device_compiler_includes "-I${directory}" ${device_compiler_includes})
-  endforeach()
-  if (CMAKE_INCLUDE_PATH)
-    foreach(directory ${CMAKE_INCLUDE_PATH})
-      set(device_compiler_includes "-I${directory}"
-        ${device_compiler_includes})
+function(__build_ir)
+  set(options)
+  set(one_value_args
+    TARGET
+    SOURCE
+    COUNTER
+  )
+  set(multi_value_args)
+  cmake_parse_arguments(SDK_BUILD_IR
+    "${options}"
+    "${one_value_args}"
+    "${multi_value_args}"
+    ${ARGN}
+  )
+  get_filename_component(sourceFileName ${SDK_BUILD_IR_SOURCE} NAME)
+
+  # Set the path to the integration header.
+  # The .sycl filename must depend on the target so that different targets
+  # using the same source file will be generated with a different rule.
+  set(baseSyclName ${CMAKE_CURRENT_BINARY_DIR}/${SDK_BUILD_IR_TARGET}_${sourceFileName})
+  set(outputSyclFile ${baseSyclName}.sycl)
+  set(outputDeviceFile ${baseSyclName}.${IR_MAP_${COMPUTECPP_BITCODE}})
+  set(depFileName ${baseSyclName}.sycl.d)
+
+  set(include_directories "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},INCLUDE_DIRECTORIES>")
+  set(compile_definitions "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},COMPILE_DEFINITIONS>")
+  set(generated_include_directories
+    $<$<BOOL:${include_directories}>:-I\"$<JOIN:${include_directories},\"\t-I\">\">)
+  set(generated_compile_definitions
+    $<$<BOOL:${compile_definitions}>:-D$<JOIN:${compile_definitions},\t-D>>)
+
+  # Obtain language standard of the file
+  set(device_compiler_cxx_standard)
+  get_target_property(targetCxxStandard ${SDK_BUILD_IR_TARGET} CXX_STANDARD)
+  if (targetCxxStandard MATCHES 17)
+    set(device_compiler_cxx_standard "-std=c++1z")
+  elseif (targetCxxStandard MATCHES 14)
+    set(device_compiler_cxx_standard "-std=c++14")
+  elseif (targetCxxStandard MATCHES 11)
+    set(device_compiler_cxx_standard "-std=c++11")
+  elseif (targetCxxStandard MATCHES 98)
+    message(FATAL_ERROR "SYCL applications cannot be compiled using C++98")
+  else ()
+    set(device_compiler_cxx_standard "")
+  endif()
+
+  get_property(source_compile_flags
+    SOURCE ${SDK_BUILD_IR_SOURCE}
+    PROPERTY COMPUTECPP_SOURCE_FLAGS
+  )
+  separate_arguments(source_compile_flags)
+  if(source_compile_flags)
+    list(APPEND computecpp_source_flags ${source_compile_flags})
+  endif()
+
+  list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS
+    ${device_compiler_cxx_standard}
+    ${COMPUTECPP_USER_FLAGS}
+    ${computecpp_source_flags}
+  )
+
+  set(ir_dependencies ${SDK_BUILD_IR_SOURCE})
+  get_target_property(target_libraries ${SDK_BUILD_IR_TARGET} LINK_LIBRARIES)
+  if(target_libraries)
+    foreach(library ${target_libraries})
+      if(TARGET ${library})
+        list(APPEND ir_dependencies ${library})
+      endif()
     endforeach()
   endif()
 
-  # Convert argument list format
-  separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS)
+  # Depfile support was only added in CMake 3.7
+  # CMake throws an error if it is unsupported by the generator (i. e. not ninja)
+  if((NOT CMAKE_VERSION VERSION_LESS 3.7.0) AND
+          CMAKE_GENERATOR MATCHES "Ninja")
+    file(RELATIVE_PATH relOutputFile ${CMAKE_BINARY_DIR} ${outputDeviceFile})
+    set(generate_depfile -MMD -MF ${depFileName} -MT ${relOutputFile})
+    set(enable_depfile DEPFILE ${depFileName})
+  endif()
 
   # Add custom command for running compute++
   add_custom_command(
-    OUTPUT ${outputSyclFile}
-    COMMAND ${COMPUTECPP_DEVICE_COMPILER}
+    OUTPUT ${outputDeviceFile} ${outputSyclFile}
+    COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
             ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
-            -isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
-            ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
-            ${device_compiler_includes}
-            -o ${outputSyclFile}
-            -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}
-    DEPENDS ${sourceFile}
-    WORKING_DIRECTORY ${binaryDir}
-  COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
-
-  # Add a custom target for the generated integration header
-  add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile})
-
-  # Add a dependency on the integration header
-  add_dependencies(${targetName} ${targetName}_integration_header)
-
-  # Set the host compiler C++ standard to C++11
-  set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)
-
-  # Disable GCC dual ABI on GCC 5.1 and higher
-  if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
-    set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS
-      "_GLIBCXX_USE_CXX11_ABI=0")
+            ${generated_include_directories}
+            ${generated_compile_definitions}
+            -sycl-ih ${outputSyclFile}
+            -o ${outputDeviceFile}
+            -c ${SDK_BUILD_IR_SOURCE}
+            ${generate_depfile}
+    DEPENDS ${ir_dependencies}
+    IMPLICIT_DEPENDS CXX ${SDK_BUILD_IR_SOURCE}
+    ${enable_depfile}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
+
+  # Name: (user-defined name)_(source file)_(counter)_ih
+  set(headerTargetName
+    ${SDK_BUILD_IR_TARGET}_${sourceFileName}_${SDK_BUILD_IR_COUNTER}_ih)
+
+  if(NOT MSVC)
+    # Add a custom target for the generated integration header
+    add_custom_target(${headerTargetName} DEPENDS ${outputDeviceFile} ${outputSyclFile})
+    add_dependencies(${SDK_BUILD_IR_TARGET} ${headerTargetName})
+  endif()
+
+  # This property can be set on a per-target basis to indicate that the
+  # integration header should appear after the main source listing
+  get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER)
+
+  if(includeAfter)
+    # Change the source file to the integration header - e.g.
+    # g++ -c source_file_name.cpp.sycl
+    get_target_property(current_sources ${SDK_BUILD_IR_TARGET} SOURCES)
+    # Remove absolute path to source file
+    list(REMOVE_ITEM current_sources ${SDK_BUILD_IR_SOURCE})
+    # Remove relative path to source file
+    string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" ""
+      rel_source_file ${SDK_BUILD_IR_SOURCE}
+    )
+    list(REMOVE_ITEM current_sources ${rel_source_file})
+    # Add SYCL header to source list
+    list(APPEND current_sources ${outputSyclFile})
+    set_property(TARGET ${SDK_BUILD_IR_TARGET}
+      PROPERTY SOURCES ${current_sources})
+    # CMake/gcc don't know what language a .sycl file is, so tell them
+    set_property(SOURCE ${outputSyclFile} PROPERTY LANGUAGE CXX)
+    set(includedFile ${SDK_BUILD_IR_SOURCE})
+    set(cppFile ${outputSyclFile})
+  else()
+    set_property(SOURCE ${outputSyclFile} PROPERTY HEADER_FILE_ONLY ON)
+    set(includedFile ${outputSyclFile})
+    set(cppFile ${SDK_BUILD_IR_SOURCE})
+  endif()
+
+  # Force inclusion of the integration header for the host compiler
+  if(MSVC)
+    # Group SYCL files inside Visual Studio
+    source_group("SYCL" FILES ${outputSyclFile})
+
+    if(includeAfter)
+      # Allow the source file to be edited using Visual Studio.
+      # It will be added as a header file so it won't be compiled.
+      set_property(SOURCE ${SDK_BUILD_IR_SOURCE} PROPERTY HEADER_FILE_ONLY true)
+    endif()
+
+    # Add both source and the sycl files to the VS solution.
+    target_sources(${SDK_BUILD_IR_TARGET} PUBLIC ${SDK_BUILD_IR_SOURCE} ${outputSyclFile})
+
+    set(forceIncludeFlags "/FI${includedFile} /TP")
+  else()
+    set(forceIncludeFlags "-include ${includedFile} -x c++")
   endif()
 
-endfunction()
+  set_property(
+    SOURCE ${cppFile}
+    APPEND_STRING PROPERTY COMPILE_FLAGS "${forceIncludeFlags}"
+  )
+
+endfunction(__build_ir)
 
 #######################
 #  add_sycl_to_target
@@ -229,17 +384,72 @@ endfunction()
 #  Adds a SYCL compilation custom command associated with an existing
 #  target and sets a dependancy on that new command.
 #
-#  targetName : Name of the target to add a SYCL to.
-#  sourceFile : Source file to be compiled for SYCL.
-#  binaryDir : Intermediate directory to output the integration header.
+#  TARGET : Name of the target to add SYCL to.
+#  SOURCES : Source files to be compiled for SYCL.
 #
-function(add_sycl_to_target targetName sourceFile binaryDir)
+function(add_sycl_to_target)
+  set(options)
+  set(one_value_args
+    TARGET
+  )
+  set(multi_value_args
+    SOURCES
+  )
+  cmake_parse_arguments(SDK_ADD_SYCL
+    "${options}"
+    "${one_value_args}"
+    "${multi_value_args}"
+    ${ARGN}
+  )
+
+  set_target_properties(${SDK_ADD_SYCL_TARGET} PROPERTIES LINKER_LANGUAGE CXX)
 
-  # Add custom target to run compute++ and generate the integration header
-  __build_spir(${targetName} ${sourceFile} ${binaryDir})
+  # If the CXX compiler is set to compute++ enable the driver.
+  get_filename_component(cmakeCxxCompilerFileName "${CMAKE_CXX_COMPILER}" NAME)
+  if("${cmakeCxxCompilerFileName}" STREQUAL "compute++")
+    if(MSVC)
+      message(FATAL_ERROR "The compiler driver is not supported by this system,
+                           revert the CXX compiler to your default host compiler.")
+    endif()
 
-  # Link with the ComputeCpp runtime library
-  target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY}
-                        PUBLIC ${OpenCL_LIBRARIES})
+    get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER)
+    if(includeAfter)
+      list(APPEND COMPUTECPP_USER_FLAGS -fsycl-ih-last)
+    endif()
+    list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl-driver)
+    # Prepend COMPUTECPP_DEVICE_COMPILER_FLAGS and append COMPUTECPP_USER_FLAGS
+    foreach(prop COMPILE_OPTIONS INTERFACE_COMPILE_OPTIONS)
+      get_target_property(target_compile_options ${SDK_ADD_SYCL_TARGET} ${prop})
+      if(NOT target_compile_options)
+        set(target_compile_options "")
+      endif()
+      set_property(
+        TARGET ${SDK_ADD_SYCL_TARGET}
+        PROPERTY ${prop}
+        ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
+        ${target_compile_options}
+        ${COMPUTECPP_USER_FLAGS}
+      )
+    endforeach()
+  else()
+    set(fileCounter 0)
+    list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl)
+    # Add custom target to run compute++ and generate the integration header
+    foreach(sourceFile ${SDK_ADD_SYCL_SOURCES})
+      if(NOT IS_ABSOLUTE ${sourceFile})
+        set(sourceFile "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}")
+      endif()
+      __build_ir(
+        TARGET     ${SDK_ADD_SYCL_TARGET}
+        SOURCE     ${sourceFile}
+        COUNTER    ${fileCounter}
+      )
+      MATH(EXPR fileCounter "${fileCounter} + 1")
+    endforeach()
+  endif()
 
+  set_property(TARGET ${SDK_ADD_SYCL_TARGET}
+    APPEND PROPERTY LINK_LIBRARIES ComputeCpp::ComputeCpp)
+  set_property(TARGET ${SDK_ADD_SYCL_TARGET}
+    APPEND PROPERTY INTERFACE_LINK_LIBRARIES ComputeCpp::ComputeCpp)
 endfunction(add_sycl_to_target)
diff --git a/contrib/eigen/cmake/FindEigen2.cmake b/contrib/eigen/cmake/FindEigen2.cmake
index a834b887286045fcdc381633ec95bf204834aadf..eb2709dc04fd00379b7fcc371767f0402236fcc3 100644
--- a/contrib/eigen/cmake/FindEigen2.cmake
+++ b/contrib/eigen/cmake/FindEigen2.cmake
@@ -17,16 +17,16 @@
 if(NOT Eigen2_FIND_VERSION)
   if(NOT Eigen2_FIND_VERSION_MAJOR)
     set(Eigen2_FIND_VERSION_MAJOR 2)
-  endif(NOT Eigen2_FIND_VERSION_MAJOR)
+  endif()
   if(NOT Eigen2_FIND_VERSION_MINOR)
     set(Eigen2_FIND_VERSION_MINOR 0)
-  endif(NOT Eigen2_FIND_VERSION_MINOR)
+  endif()
   if(NOT Eigen2_FIND_VERSION_PATCH)
     set(Eigen2_FIND_VERSION_PATCH 0)
-  endif(NOT Eigen2_FIND_VERSION_PATCH)
+  endif()
 
   set(Eigen2_FIND_VERSION "${Eigen2_FIND_VERSION_MAJOR}.${Eigen2_FIND_VERSION_MINOR}.${Eigen2_FIND_VERSION_PATCH}")
-endif(NOT Eigen2_FIND_VERSION)
+endif()
 
 macro(_eigen2_check_version)
   file(READ "${EIGEN2_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen2_version_header)
@@ -49,8 +49,8 @@ macro(_eigen2_check_version)
 
     message(STATUS "Eigen2 version ${EIGEN2_VERSION} found in ${EIGEN2_INCLUDE_DIR}, "
                    "but at least version ${Eigen2_FIND_VERSION} is required")
-  endif(NOT EIGEN2_VERSION_OK)
-endmacro(_eigen2_check_version)
+  endif()
+endmacro()
 
 if (EIGEN2_INCLUDE_DIR)
 
@@ -58,7 +58,7 @@ if (EIGEN2_INCLUDE_DIR)
   _eigen2_check_version()
   set(EIGEN2_FOUND ${EIGEN2_VERSION_OK})
 
-else (EIGEN2_INCLUDE_DIR)
+else ()
 
 find_path(EIGEN2_INCLUDE_DIR NAMES Eigen/Core
      PATHS
@@ -69,12 +69,12 @@ find_path(EIGEN2_INCLUDE_DIR NAMES Eigen/Core
 
 if(EIGEN2_INCLUDE_DIR)
   _eigen2_check_version()
-endif(EIGEN2_INCLUDE_DIR)
+endif()
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(Eigen2 DEFAULT_MSG EIGEN2_INCLUDE_DIR EIGEN2_VERSION_OK)
 
 mark_as_advanced(EIGEN2_INCLUDE_DIR)
 
-endif(EIGEN2_INCLUDE_DIR)
+endif()
 
diff --git a/contrib/eigen/cmake/FindEigen3.cmake b/contrib/eigen/cmake/FindEigen3.cmake
index 9e969786089ca8ea3801be8b084c51a5782f09b5..0b36805e75ec37ba2af7a8fe9c8036f73558a618 100644
--- a/contrib/eigen/cmake/FindEigen3.cmake
+++ b/contrib/eigen/cmake/FindEigen3.cmake
@@ -10,8 +10,12 @@
 #  EIGEN3_INCLUDE_DIR - the eigen include directory
 #  EIGEN3_VERSION - eigen version
 #
+# and the following imported target:
+#
+#  Eigen3::Eigen - The header-only Eigen library
+#
 # This module reads hints about search locations from 
-# the following enviroment variables:
+# the following environment variables:
 #
 # EIGEN3_ROOT
 # EIGEN3_ROOT_DIR
@@ -24,16 +28,16 @@
 if(NOT Eigen3_FIND_VERSION)
   if(NOT Eigen3_FIND_VERSION_MAJOR)
     set(Eigen3_FIND_VERSION_MAJOR 2)
-  endif(NOT Eigen3_FIND_VERSION_MAJOR)
+  endif()
   if(NOT Eigen3_FIND_VERSION_MINOR)
     set(Eigen3_FIND_VERSION_MINOR 91)
-  endif(NOT Eigen3_FIND_VERSION_MINOR)
+  endif()
   if(NOT Eigen3_FIND_VERSION_PATCH)
     set(Eigen3_FIND_VERSION_PATCH 0)
-  endif(NOT Eigen3_FIND_VERSION_PATCH)
+  endif()
 
   set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
-endif(NOT Eigen3_FIND_VERSION)
+endif()
 
 macro(_eigen3_check_version)
   file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)
@@ -48,24 +52,25 @@ macro(_eigen3_check_version)
   set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
   if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
     set(EIGEN3_VERSION_OK FALSE)
-  else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+  else()
     set(EIGEN3_VERSION_OK TRUE)
-  endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+  endif()
 
   if(NOT EIGEN3_VERSION_OK)
 
     message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
                    "but at least version ${Eigen3_FIND_VERSION} is required")
-  endif(NOT EIGEN3_VERSION_OK)
-endmacro(_eigen3_check_version)
+  endif()
+endmacro()
 
 if (EIGEN3_INCLUDE_DIR)
 
   # in cache already
   _eigen3_check_version()
   set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
+  set(Eigen3_FOUND ${EIGEN3_VERSION_OK})
 
-else (EIGEN3_INCLUDE_DIR)
+else ()
   
   # search first if an Eigen3Config.cmake is available in the system,
   # if successful this would set EIGEN3_INCLUDE_DIR and the rest of
@@ -82,16 +87,21 @@ else (EIGEN3_INCLUDE_DIR)
         ${KDE4_INCLUDE_DIR}
         PATH_SUFFIXES eigen3 eigen
       )
-  endif(NOT EIGEN3_INCLUDE_DIR)
+  endif()
 
   if(EIGEN3_INCLUDE_DIR)
     _eigen3_check_version()
-  endif(EIGEN3_INCLUDE_DIR)
+  endif()
 
   include(FindPackageHandleStandardArgs)
   find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)
 
   mark_as_advanced(EIGEN3_INCLUDE_DIR)
 
-endif(EIGEN3_INCLUDE_DIR)
+endif()
 
+if(EIGEN3_FOUND AND NOT TARGET Eigen3::Eigen)
+  add_library(Eigen3::Eigen INTERFACE IMPORTED)
+  set_target_properties(Eigen3::Eigen PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${EIGEN3_INCLUDE_DIR}")
+endif()
diff --git a/contrib/eigen/cmake/FindFFTW.cmake b/contrib/eigen/cmake/FindFFTW.cmake
index 6c4dc9ab495d5ab56b6b3943b2c53200b35c5ee8..ed55c5fad6baa31628a2327541a03c1701055566 100644
--- a/contrib/eigen/cmake/FindFFTW.cmake
+++ b/contrib/eigen/cmake/FindFFTW.cmake
@@ -22,7 +22,8 @@ if( NOT FFTW_ROOT AND ENV{FFTWDIR} )
 endif()
 
 # Check if we can use PkgConfig
-find_package(PkgConfig)
+include(CMakeFindDependencyMacro)
+find_dependency(PkgConfig)
 
 #Determine from PKG
 if( PKG_CONFIG_FOUND AND NOT FFTW_ROOT )
@@ -101,7 +102,7 @@ else()
     PATHS ${PKG_FFTW_INCLUDE_DIRS} ${INCLUDE_INSTALL_DIR}
   )
 
-endif( FFTW_ROOT )
+endif()
 
 set(FFTW_LIBRARIES ${FFTW_LIB} ${FFTWF_LIB})
 
diff --git a/contrib/eigen/cmake/FindGLEW.cmake b/contrib/eigen/cmake/FindGLEW.cmake
index 54da20f12bdba627f0d90fb8d225e4995b92ea7c..9d486d5ba8a2221b1a764b5abb345015e6a2679e 100644
--- a/contrib/eigen/cmake/FindGLEW.cmake
+++ b/contrib/eigen/cmake/FindGLEW.cmake
@@ -10,47 +10,47 @@
 # Also defined, but not for general use are:
 #  GLEW_GLEW_LIBRARY = the full path to the glew library.
 
-IF (WIN32)
+if (WIN32)
 
-  IF(CYGWIN)
+  if(CYGWIN)
 
-    FIND_PATH( GLEW_INCLUDE_DIR GL/glew.h)
+    find_path( GLEW_INCLUDE_DIR GL/glew.h)
 
-    FIND_LIBRARY( GLEW_GLEW_LIBRARY glew32
+    find_library( GLEW_GLEW_LIBRARY glew32
       ${OPENGL_LIBRARY_DIR}
       /usr/lib/w32api
       /usr/X11R6/lib
     )
 
 
-  ELSE(CYGWIN)
+  else(CYGWIN)
   
-    FIND_PATH( GLEW_INCLUDE_DIR GL/glew.h
+    find_path( GLEW_INCLUDE_DIR GL/glew.h
       $ENV{GLEW_ROOT_PATH}/include
     )
 
-    FIND_LIBRARY( GLEW_GLEW_LIBRARY
+    find_library( GLEW_GLEW_LIBRARY
       NAMES glew glew32
       PATHS
       $ENV{GLEW_ROOT_PATH}/lib
       ${OPENGL_LIBRARY_DIR}
     )
 
-  ENDIF(CYGWIN)
+  endif(CYGWIN)
 
-ELSE (WIN32)
+else (WIN32)
 
-  IF (APPLE)
+  if (APPLE)
 # These values for Apple could probably do with improvement.
-    FIND_PATH( GLEW_INCLUDE_DIR glew.h
+    find_path( GLEW_INCLUDE_DIR glew.h
       /System/Library/Frameworks/GLEW.framework/Versions/A/Headers
       ${OPENGL_LIBRARY_DIR}
     )
-    SET(GLEW_GLEW_LIBRARY "-framework GLEW" CACHE STRING "GLEW library for OSX")
-    SET(GLEW_cocoa_LIBRARY "-framework Cocoa" CACHE STRING "Cocoa framework for OSX")
-  ELSE (APPLE)
+    set(GLEW_GLEW_LIBRARY "-framework GLEW" CACHE STRING "GLEW library for OSX")
+    set(GLEW_cocoa_LIBRARY "-framework Cocoa" CACHE STRING "Cocoa framework for OSX")
+  else (APPLE)
 
-    FIND_PATH( GLEW_INCLUDE_DIR GL/glew.h
+    find_path( GLEW_INCLUDE_DIR GL/glew.h
       /usr/include/GL
       /usr/openwin/share/include
       /usr/openwin/include
@@ -60,44 +60,44 @@ ELSE (WIN32)
       /opt/graphics/OpenGL/contrib/libglew
     )
 
-    FIND_LIBRARY( GLEW_GLEW_LIBRARY GLEW
+    find_library( GLEW_GLEW_LIBRARY GLEW
       /usr/openwin/lib
       /usr/X11R6/lib
     )
 
-  ENDIF (APPLE)
+  endif (APPLE)
 
-ENDIF (WIN32)
+endif (WIN32)
 
-SET( GLEW_FOUND "NO" )
-IF(GLEW_INCLUDE_DIR)
-  IF(GLEW_GLEW_LIBRARY)
+set( GLEW_FOUND "NO" )
+if(GLEW_INCLUDE_DIR)
+  if(GLEW_GLEW_LIBRARY)
     # Is -lXi and -lXmu required on all platforms that have it?
     # If not, we need some way to figure out what platform we are on.
-    SET( GLEW_LIBRARIES
+    set( GLEW_LIBRARIES
       ${GLEW_GLEW_LIBRARY}
       ${GLEW_cocoa_LIBRARY}
     )
-    SET( GLEW_FOUND "YES" )
+    set( GLEW_FOUND "YES" )
 
 #The following deprecated settings are for backwards compatibility with CMake1.4
-    SET (GLEW_LIBRARY ${GLEW_LIBRARIES})
-    SET (GLEW_INCLUDE_PATH ${GLEW_INCLUDE_DIR})
-
-  ENDIF(GLEW_GLEW_LIBRARY)
-ENDIF(GLEW_INCLUDE_DIR)
-
-IF(GLEW_FOUND)
-  IF(NOT GLEW_FIND_QUIETLY)
-    MESSAGE(STATUS "Found Glew: ${GLEW_LIBRARIES}")
-  ENDIF(NOT GLEW_FIND_QUIETLY)
-ELSE(GLEW_FOUND)
-  IF(GLEW_FIND_REQUIRED)
-    MESSAGE(FATAL_ERROR "Could not find Glew")
-  ENDIF(GLEW_FIND_REQUIRED)
-ENDIF(GLEW_FOUND)
-
-MARK_AS_ADVANCED(
+    set (GLEW_LIBRARY ${GLEW_LIBRARIES})
+    set (GLEW_INCLUDE_PATH ${GLEW_INCLUDE_DIR})
+
+  endif(GLEW_GLEW_LIBRARY)
+endif(GLEW_INCLUDE_DIR)
+
+if(GLEW_FOUND)
+  if(NOT GLEW_FIND_QUIETLY)
+    message(STATUS "Found Glew: ${GLEW_LIBRARIES}")
+  endif(NOT GLEW_FIND_QUIETLY)
+else(GLEW_FOUND)
+  if(GLEW_FIND_REQUIRED)
+    message(FATAL_ERROR "Could not find Glew")
+  endif(GLEW_FIND_REQUIRED)
+endif(GLEW_FOUND)
+
+mark_as_advanced(
   GLEW_INCLUDE_DIR
   GLEW_GLEW_LIBRARY
   GLEW_Xmu_LIBRARY
diff --git a/contrib/eigen/cmake/FindGMP.cmake b/contrib/eigen/cmake/FindGMP.cmake
index 1f0273960142810512c933c53e2ff8de7ab9d428..c41eedcf0a5ed3d818fb099d03b22e8e1fc231c8 100644
--- a/contrib/eigen/cmake/FindGMP.cmake
+++ b/contrib/eigen/cmake/FindGMP.cmake
@@ -3,7 +3,7 @@
 
 if (GMP_INCLUDES AND GMP_LIBRARIES)
   set(GMP_FIND_QUIETLY TRUE)
-endif (GMP_INCLUDES AND GMP_LIBRARIES)
+endif ()
 
 find_path(GMP_INCLUDES
   NAMES
diff --git a/contrib/eigen/cmake/FindGSL.cmake b/contrib/eigen/cmake/FindGSL.cmake
index bf411a7f9814843a67465198cb61c82e59c4beff..8632232f9d99b81f6fce5509b43b6e707c0207fe 100644
--- a/contrib/eigen/cmake/FindGSL.cmake
+++ b/contrib/eigen/cmake/FindGSL.cmake
@@ -23,9 +23,9 @@
 # www.mip.informatik.uni-kiel.de
 # --------------------------------
 
-IF(WIN32)
+if(WIN32)
   # JW tested with gsl-1.8, Windows XP, MSVS 7.1
-  SET(GSL_POSSIBLE_ROOT_DIRS
+  set(GSL_POSSIBLE_ROOT_DIRS
     ${GSL_ROOT_DIR}
     $ENV{GSL_ROOT_DIR}
     ${GSL_DIR}
@@ -35,136 +35,136 @@ IF(WIN32)
     $ENV{EXTRA}
     "C:/Program Files/GnuWin32"
     )
-  FIND_PATH(GSL_INCLUDE_DIR
+  find_path(GSL_INCLUDE_DIR
     NAMES gsl/gsl_cdf.h gsl/gsl_randist.h
     PATHS ${GSL_POSSIBLE_ROOT_DIRS}
     PATH_SUFFIXES include
     DOC "GSL header include dir"
     )
   
-  FIND_LIBRARY(GSL_GSL_LIBRARY
+  find_library(GSL_GSL_LIBRARY
     NAMES libgsl.dll.a gsl libgsl
     PATHS  ${GSL_POSSIBLE_ROOT_DIRS}
     PATH_SUFFIXES lib
     DOC "GSL library" )
   
   if(NOT GSL_GSL_LIBRARY)
-	FIND_FILE(GSL_GSL_LIBRARY
+	find_file(GSL_GSL_LIBRARY
 		NAMES libgsl.dll.a
 		PATHS  ${GSL_POSSIBLE_ROOT_DIRS}
 		PATH_SUFFIXES lib
 		DOC "GSL library")
-  endif(NOT GSL_GSL_LIBRARY)
+  endif()
   
-  FIND_LIBRARY(GSL_GSLCBLAS_LIBRARY
+  find_library(GSL_GSLCBLAS_LIBRARY
     NAMES libgslcblas.dll.a gslcblas libgslcblas
     PATHS  ${GSL_POSSIBLE_ROOT_DIRS}
     PATH_SUFFIXES lib
     DOC "GSL cblas library dir" )
   
   if(NOT GSL_GSLCBLAS_LIBRARY)
-	FIND_FILE(GSL_GSLCBLAS_LIBRARY
+	find_file(GSL_GSLCBLAS_LIBRARY
 		NAMES libgslcblas.dll.a
 		PATHS  ${GSL_POSSIBLE_ROOT_DIRS}
 		PATH_SUFFIXES lib
 		DOC "GSL library")
-  endif(NOT GSL_GSLCBLAS_LIBRARY)
+  endif()
   
-  SET(GSL_LIBRARIES ${GSL_GSL_LIBRARY})
+  set(GSL_LIBRARIES ${GSL_GSL_LIBRARY})
 
-  #MESSAGE("DBG\n"
+  #message("DBG\n"
   #  "GSL_GSL_LIBRARY=${GSL_GSL_LIBRARY}\n"
   #  "GSL_GSLCBLAS_LIBRARY=${GSL_GSLCBLAS_LIBRARY}\n"
   #  "GSL_LIBRARIES=${GSL_LIBRARIES}")
 
 
-ELSE(WIN32)
+else(WIN32)
   
-  IF(UNIX) 
-    SET(GSL_CONFIG_PREFER_PATH 
+  if(UNIX) 
+    set(GSL_CONFIG_PREFER_PATH 
       "$ENV{GSL_DIR}/bin"
       "$ENV{GSL_DIR}"
       "$ENV{GSL_HOME}/bin" 
       "$ENV{GSL_HOME}" 
       CACHE STRING "preferred path to GSL (gsl-config)")
-    FIND_PROGRAM(GSL_CONFIG gsl-config
+    find_program(GSL_CONFIG gsl-config
       ${GSL_CONFIG_PREFER_PATH}
       /usr/bin/
       )
-    # MESSAGE("DBG GSL_CONFIG ${GSL_CONFIG}")
+    # message("DBG GSL_CONFIG ${GSL_CONFIG}")
     
-    IF (GSL_CONFIG) 
+    if (GSL_CONFIG) 
       # set CXXFLAGS to be fed into CXX_FLAGS by the user:
-      SET(GSL_CXX_FLAGS "`${GSL_CONFIG} --cflags`")
+      set(GSL_CXX_FLAGS "`${GSL_CONFIG} --cflags`")
       
       # set INCLUDE_DIRS to prefix+include
-      EXEC_PROGRAM(${GSL_CONFIG}
+      exec_program(${GSL_CONFIG}
         ARGS --prefix
         OUTPUT_VARIABLE GSL_PREFIX)
-      SET(GSL_INCLUDE_DIR ${GSL_PREFIX}/include CACHE STRING INTERNAL)
+      set(GSL_INCLUDE_DIR ${GSL_PREFIX}/include CACHE STRING INTERNAL)
 
       # set link libraries and link flags
-      #SET(GSL_LIBRARIES "`${GSL_CONFIG} --libs`")
-      EXEC_PROGRAM(${GSL_CONFIG}
+      #set(GSL_LIBRARIES "`${GSL_CONFIG} --libs`")
+      exec_program(${GSL_CONFIG}
         ARGS --libs
         OUTPUT_VARIABLE GSL_LIBRARIES )
         
       # extract link dirs for rpath  
-      EXEC_PROGRAM(${GSL_CONFIG}
+      exec_program(${GSL_CONFIG}
         ARGS --libs
         OUTPUT_VARIABLE GSL_CONFIG_LIBS )
       
       # extract version
-      EXEC_PROGRAM(${GSL_CONFIG}
+      exec_program(${GSL_CONFIG}
         ARGS --version
         OUTPUT_VARIABLE GSL_FULL_VERSION )
       
       # split version as major/minor
-      STRING(REGEX MATCH "(.)\\..*" GSL_VERSION_MAJOR_ "${GSL_FULL_VERSION}")
-      SET(GSL_VERSION_MAJOR ${CMAKE_MATCH_1})
-      STRING(REGEX MATCH ".\\.(.*)" GSL_VERSION_MINOR_ "${GSL_FULL_VERSION}")
-      SET(GSL_VERSION_MINOR ${CMAKE_MATCH_1})
+      string(REGEX MATCH "(.)\\..*" GSL_VERSION_MAJOR_ "${GSL_FULL_VERSION}")
+      set(GSL_VERSION_MAJOR ${CMAKE_MATCH_1})
+      string(REGEX MATCH ".\\.(.*)" GSL_VERSION_MINOR_ "${GSL_FULL_VERSION}")
+      set(GSL_VERSION_MINOR ${CMAKE_MATCH_1})
 
       # split off the link dirs (for rpath)
       # use regular expression to match wildcard equivalent "-L*<endchar>"
       # with <endchar> is a space or a semicolon
-      STRING(REGEX MATCHALL "[-][L]([^ ;])+" 
+      string(REGEX MATCHALL "[-][L]([^ ;])+" 
         GSL_LINK_DIRECTORIES_WITH_PREFIX 
         "${GSL_CONFIG_LIBS}" )
-      #      MESSAGE("DBG  GSL_LINK_DIRECTORIES_WITH_PREFIX=${GSL_LINK_DIRECTORIES_WITH_PREFIX}")
+      #      message("DBG  GSL_LINK_DIRECTORIES_WITH_PREFIX=${GSL_LINK_DIRECTORIES_WITH_PREFIX}")
 
       # remove prefix -L because we need the pure directory for LINK_DIRECTORIES
       
-      IF (GSL_LINK_DIRECTORIES_WITH_PREFIX)
-        STRING(REGEX REPLACE "[-][L]" "" GSL_LINK_DIRECTORIES ${GSL_LINK_DIRECTORIES_WITH_PREFIX} )
-      ENDIF (GSL_LINK_DIRECTORIES_WITH_PREFIX)
-      SET(GSL_EXE_LINKER_FLAGS "-Wl,-rpath,${GSL_LINK_DIRECTORIES}" CACHE STRING INTERNAL)
-      #      MESSAGE("DBG  GSL_LINK_DIRECTORIES=${GSL_LINK_DIRECTORIES}")
-      #      MESSAGE("DBG  GSL_EXE_LINKER_FLAGS=${GSL_EXE_LINKER_FLAGS}")
+      if (GSL_LINK_DIRECTORIES_WITH_PREFIX)
+        string(REGEX REPLACE "[-][L]" "" GSL_LINK_DIRECTORIES ${GSL_LINK_DIRECTORIES_WITH_PREFIX} )
+      endif (GSL_LINK_DIRECTORIES_WITH_PREFIX)
+      set(GSL_EXE_LINKER_FLAGS "-Wl,-rpath,${GSL_LINK_DIRECTORIES}" CACHE STRING INTERNAL)
+      #      message("DBG  GSL_LINK_DIRECTORIES=${GSL_LINK_DIRECTORIES}")
+      #      message("DBG  GSL_EXE_LINKER_FLAGS=${GSL_EXE_LINKER_FLAGS}")
 
-      #      ADD_DEFINITIONS("-DHAVE_GSL")
-      #      SET(GSL_DEFINITIONS "-DHAVE_GSL")
-      MARK_AS_ADVANCED(
+      #      add_definitions("-DHAVE_GSL")
+      #      set(GSL_DEFINITIONS "-DHAVE_GSL")
+      mark_as_advanced(
         GSL_CXX_FLAGS
         GSL_INCLUDE_DIR
         GSL_LIBRARIES
         GSL_LINK_DIRECTORIES
         GSL_DEFINITIONS
         )
-      MESSAGE(STATUS "Using GSL from ${GSL_PREFIX}")
+      message(STATUS "Using GSL from ${GSL_PREFIX}")
       
-    ELSE(GSL_CONFIG)
-      MESSAGE("FindGSL.cmake: gsl-config not found. Please set it manually. GSL_CONFIG=${GSL_CONFIG}")
-    ENDIF(GSL_CONFIG)
+    else(GSL_CONFIG)
+      message("FindGSL.cmake: gsl-config not found. Please set it manually. GSL_CONFIG=${GSL_CONFIG}")
+    endif(GSL_CONFIG)
 
-  ENDIF(UNIX)
-ENDIF(WIN32)
+  endif(UNIX)
+endif(WIN32)
 
 
-IF(GSL_LIBRARIES)
-  IF(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS)
+if(GSL_LIBRARIES)
+  if(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS)
 
-    SET(GSL_FOUND 1)
+    set(GSL_FOUND 1)
     
-  ENDIF(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS)
-ENDIF(GSL_LIBRARIES)
+  endif(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS)
+endif(GSL_LIBRARIES)
diff --git a/contrib/eigen/cmake/FindGoogleHash.cmake b/contrib/eigen/cmake/FindGoogleHash.cmake
index f6a81a02ce9dbac0710ef2b5e46eb8b9c5aa5999..481eb4dad5fa234672600d5de7f62320ad5984ab 100644
--- a/contrib/eigen/cmake/FindGoogleHash.cmake
+++ b/contrib/eigen/cmake/FindGoogleHash.cmake
@@ -1,7 +1,7 @@
 
 if (GOOGLEHASH_INCLUDES AND GOOGLEHASH_LIBRARIES)
   set(GOOGLEHASH_FIND_QUIETLY TRUE)
-endif (GOOGLEHASH_INCLUDES AND GOOGLEHASH_LIBRARIES)
+endif ()
 
 find_path(GOOGLEHASH_INCLUDES
   NAMES
@@ -15,9 +15,9 @@ if(GOOGLEHASH_INCLUDES)
   file(WRITE ${CMAKE_BINARY_DIR}/googlehash_test.cpp
   "#include <google/sparse_hash_map>\n#include <google/dense_hash_map>\nint main(int argc, char** argv) { google::dense_hash_map<int,float> a; google::sparse_hash_map<int,float> b; return 0;}\n")
   try_compile(GOOGLEHASH_COMPILE ${CMAKE_BINARY_DIR} ${CMAKE_BINARY_DIR}/googlehash_test.cpp OUTPUT_VARIABLE GOOGLEHASH_COMPILE_RESULT)
-endif(GOOGLEHASH_INCLUDES)
+endif()
 
 include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(GOOGLEHASH DEFAULT_MSG GOOGLEHASH_INCLUDES GOOGLEHASH_COMPILE)
+find_package_handle_standard_args(GoogleHash DEFAULT_MSG GOOGLEHASH_INCLUDES GOOGLEHASH_COMPILE)
 
 mark_as_advanced(GOOGLEHASH_INCLUDES)
diff --git a/contrib/eigen/cmake/FindHWLOC.cmake b/contrib/eigen/cmake/FindHWLOC.cmake
index a831b5c725acc0b88dfba169e3ed477dd5bf2910..522f5215795ab937f1a815d5b9c1cf046138a265 100644
--- a/contrib/eigen/cmake/FindHWLOC.cmake
+++ b/contrib/eigen/cmake/FindHWLOC.cmake
@@ -65,8 +65,9 @@ endif()
 
 # Optionally use pkg-config to detect include/library dirs (if pkg-config is available)
 # -------------------------------------------------------------------------------------
-include(FindPkgConfig)
-find_package(PkgConfig QUIET)
+include(CMakeFindDependencyMacro)
+# include(FindPkgConfig)
+find_dependency(PkgConfig QUIET)
 if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER )
 
   pkg_search_module(HWLOC hwloc)
@@ -85,7 +86,7 @@ if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER )
     endif()
   endif()
 
-endif( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER )
+endif()
 
 if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) OR (HWLOC_GIVEN_BY_USER) )
 
@@ -282,9 +283,9 @@ if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) O
     set(CMAKE_REQUIRED_INCLUDES)
     set(CMAKE_REQUIRED_FLAGS)
     set(CMAKE_REQUIRED_LIBRARIES)
-  endif(HWLOC_LIBRARIES)
+  endif()
 
-endif( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) OR (HWLOC_GIVEN_BY_USER) )
+endif()
 
 if (HWLOC_LIBRARIES)
   if (HWLOC_LIBRARY_DIRS)
diff --git a/contrib/eigen/cmake/FindKLU.cmake b/contrib/eigen/cmake/FindKLU.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..6217d14908fe3720af561c2101e91666bda2ee07
--- /dev/null
+++ b/contrib/eigen/cmake/FindKLU.cmake
@@ -0,0 +1,48 @@
+# KLU lib usually requires linking to a blas library.
+# It is up to the user of this module to find a BLAS and link to it.
+
+if (KLU_INCLUDES AND KLU_LIBRARIES)
+  set(KLU_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(KLU_INCLUDES
+  NAMES
+  klu.h
+  PATHS
+  $ENV{KLUDIR}
+  ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES
+  suitesparse
+  ufsparse
+)
+
+find_library(KLU_LIBRARIES klu PATHS $ENV{KLUDIR} ${LIB_INSTALL_DIR})
+
+if(KLU_LIBRARIES)
+
+  if(NOT KLU_LIBDIR)
+    get_filename_component(KLU_LIBDIR ${KLU_LIBRARIES} PATH)
+  endif()
+
+  find_library(COLAMD_LIBRARY colamd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR})
+  if(COLAMD_LIBRARY)
+    set(KLU_LIBRARIES ${KLU_LIBRARIES} ${COLAMD_LIBRARY})
+  endif ()
+  
+  find_library(AMD_LIBRARY amd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR})
+  if(AMD_LIBRARY)
+    set(KLU_LIBRARIES ${KLU_LIBRARIES} ${AMD_LIBRARY})
+  endif ()
+
+  find_library(BTF_LIBRARY btf PATHS $ENV{KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR})
+  if(BTF_LIBRARY)
+    set(KLU_LIBRARIES ${KLU_LIBRARIES} ${BTF_LIBRARY})
+  endif()
+
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(KLU DEFAULT_MSG
+                                  KLU_INCLUDES KLU_LIBRARIES)
+
+mark_as_advanced(KLU_INCLUDES KLU_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY BTF_LIBRARY)
diff --git a/contrib/eigen/cmake/FindLAPACK.cmake b/contrib/eigen/cmake/FindLAPACK.cmake
index 2fcae2199f5f55f707089108301eb2c0a350671d..3fd738807090183de5f57ac2466582cc7126c4a2 100644
--- a/contrib/eigen/cmake/FindLAPACK.cmake
+++ b/contrib/eigen/cmake/FindLAPACK.cmake
@@ -26,6 +26,7 @@
 
 
 include(CheckFunctionExists)
+include(CMakeFindDependencyMacro)
 
 # This macro checks for the existence of the combination of fortran libraries
 # given by _list.  If the combination is found, this macro checks (using the
@@ -75,8 +76,8 @@ macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _b
       mark_as_advanced(${_prefix}_${_library}_LIBRARY)
       set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
       set(_libraries_found ${${_prefix}_${_library}_LIBRARY})
-    endif(_libraries_found)
-  endforeach(_library ${_list})
+    endif()
+  endforeach()
   if(_libraries_found)
     set(_libraries_found ${${LIBRARIES}})
   endif()
@@ -88,7 +89,7 @@ macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _b
     set(${LIBRARIES}    ${_libraries_found})
     # Some C++ linkers require the f2c library to link with Fortran libraries.
     # I do not know which ones, thus I just add the f2c library if it is available.
-    find_package( F2C QUIET )
+    find_dependency( F2C QUIET )
     if ( F2C_FOUND )
       set(${DEFINITIONS}  ${${DEFINITIONS}} ${F2C_DEFINITIONS})
       set(${LIBRARIES}    ${${LIBRARIES}} ${F2C_LIBRARIES})
@@ -103,7 +104,7 @@ macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _b
     set(CMAKE_REQUIRED_LIBRARIES    "")
     mark_as_advanced(${_prefix}_${_name}_${_combined_name}_f2c_WORKS)
     set(_libraries_work ${${_prefix}_${_name}_${_combined_name}_f2c_WORKS})
-  endif(_libraries_found AND NOT _libraries_work)
+  endif()
 
   # If not found, test this combination of libraries with a C interface.
   # A few implementations (ie ACML) provide a C interface. Unfortunately, there is no standard.
@@ -117,7 +118,7 @@ macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _b
     set(CMAKE_REQUIRED_LIBRARIES "")
     mark_as_advanced(${_prefix}_${_name}${_combined_name}_WORKS)
     set(_libraries_work ${${_prefix}_${_name}${_combined_name}_WORKS})
-  endif(_libraries_found AND NOT _libraries_work)
+  endif()
 
   # on failure
   if(NOT _libraries_work)
@@ -126,7 +127,7 @@ macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _b
   endif()
   #message("DEBUG: ${DEFINITIONS} = ${${DEFINITIONS}}")
   #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}")
-endmacro(check_lapack_libraries)
+endmacro()
 
 
 #
@@ -135,9 +136,9 @@ endmacro(check_lapack_libraries)
 
 # LAPACK requires BLAS
 if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED)
-  find_package(BLAS)
+  find_dependency(BLAS)
 else()
-  find_package(BLAS REQUIRED)
+  find_dependency(BLAS REQUIRED)
 endif()
 
 if (NOT BLAS_FOUND)
@@ -216,7 +217,7 @@ else()
       "${BLAS_LIBRARIES}"
       "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR"
       )
-    endif ( NOT LAPACK_LIBRARIES )
+    endif ()
 
     # Generic LAPACK library?
     # This configuration *must* be the last try as this library is notably slow.
@@ -242,14 +243,14 @@ else()
   if(NOT LAPACK_FIND_QUIETLY)
     if(LAPACK_FOUND)
       message(STATUS "A library with LAPACK API found.")
-    else(LAPACK_FOUND)
+    else()
       if(LAPACK_FIND_REQUIRED)
         message(FATAL_ERROR "A required library with LAPACK API not found. Please specify library location.")
       else()
         message(STATUS "A library with LAPACK API not found. Please specify library location.")
       endif()
-    endif(LAPACK_FOUND)
-  endif(NOT LAPACK_FIND_QUIETLY)
+    endif()
+  endif()
 
   # Add variables to cache
   set( LAPACK_INCLUDE_DIR   "${LAPACK_INCLUDE_DIR}"
@@ -270,4 +271,4 @@ else()
   #message("DEBUG: LAPACK_LIBRARIES_DIR = ${LAPACK_LIBRARIES_DIR}")
   #message("DEBUG: LAPACK_FOUND = ${LAPACK_FOUND}")
 
-endif(NOT BLAS_FOUND)
+endif()
diff --git a/contrib/eigen/cmake/FindMPFR.cmake b/contrib/eigen/cmake/FindMPFR.cmake
index aa4c2cd7d29c65f88dd63b5507b9f1aeeeb3f7a6..d8da9d6ff81105913c58741831e38d6abb803653 100644
--- a/contrib/eigen/cmake/FindMPFR.cmake
+++ b/contrib/eigen/cmake/FindMPFR.cmake
@@ -32,16 +32,16 @@ find_path(MPFR_INCLUDES
 if(NOT MPFR_FIND_VERSION)
   if(NOT MPFR_FIND_VERSION_MAJOR)
     set(MPFR_FIND_VERSION_MAJOR 1)
-  endif(NOT MPFR_FIND_VERSION_MAJOR)
+  endif()
   if(NOT MPFR_FIND_VERSION_MINOR)
     set(MPFR_FIND_VERSION_MINOR 0)
-  endif(NOT MPFR_FIND_VERSION_MINOR)
+  endif()
   if(NOT MPFR_FIND_VERSION_PATCH)
     set(MPFR_FIND_VERSION_PATCH 0)
-  endif(NOT MPFR_FIND_VERSION_PATCH)
+  endif()
 
   set(MPFR_FIND_VERSION "${MPFR_FIND_VERSION_MAJOR}.${MPFR_FIND_VERSION_MINOR}.${MPFR_FIND_VERSION_PATCH}")
-endif(NOT MPFR_FIND_VERSION)
+endif()
 
 
 if(MPFR_INCLUDES)
@@ -65,11 +65,11 @@ if(MPFR_INCLUDES)
     set(MPFR_VERSION_OK FALSE)
     message(STATUS "MPFR version ${MPFR_VERSION} found in ${MPFR_INCLUDES}, "
                    "but at least version ${MPFR_FIND_VERSION} is required")
-  else(${MPFR_VERSION} VERSION_LESS ${MPFR_FIND_VERSION})
+  else()
     set(MPFR_VERSION_OK TRUE)
-  endif(${MPFR_VERSION} VERSION_LESS ${MPFR_FIND_VERSION})
+  endif()
 
-endif(MPFR_INCLUDES)
+endif()
 
 # Set MPFR_LIBRARIES
 
diff --git a/contrib/eigen/cmake/FindMPREAL.cmake b/contrib/eigen/cmake/FindMPREAL.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..947a1ce88678ddb4ffd7f49d38ac30e731afb332
--- /dev/null
+++ b/contrib/eigen/cmake/FindMPREAL.cmake
@@ -0,0 +1,103 @@
+# Try to find the MPFR C++ (MPREAL) library
+# See http://www.holoborodko.com/pavel/mpreal/
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(MPREAL 1.8.6)
+# to require version 1.8.6 or newer of MPREAL C++.
+#
+# Once done this will define
+#
+#  MPREAL_FOUND - system has MPREAL lib with correct version
+#  MPREAL_INCLUDES - MPREAL required include directories
+#  MPREAL_LIBRARIES - MPREAL required libraries
+#  MPREAL_VERSION - MPREAL version
+
+# Copyright (c) 2020 The Eigen Authors.
+# Redistribution and use is allowed according to the terms of the BSD license.
+
+include(CMakeFindDependencyMacro)
+find_dependency(MPFR)
+find_dependency(GMP)
+
+# Set MPREAL_INCLUDES
+find_path(MPREAL_INCLUDES
+  NAMES
+  mpreal.h
+  PATHS
+  $ENV{GMPDIR}
+  ${INCLUDE_INSTALL_DIR}
+)
+
+# Set MPREAL_FIND_VERSION to 1.0.0 if no minimum version is specified
+
+if(NOT MPREAL_FIND_VERSION)
+  if(NOT MPREAL_FIND_VERSION_MAJOR)
+    set(MPREAL_FIND_VERSION_MAJOR 1)
+  endif()
+  if(NOT MPREAL_FIND_VERSION_MINOR)
+    set(MPREAL_FIND_VERSION_MINOR 0)
+  endif()
+  if(NOT MPREAL_FIND_VERSION_PATCH)
+    set(MPREAL_FIND_VERSION_PATCH 0)
+  endif()
+
+  set(MPREAL_FIND_VERSION "${MPREAL_FIND_VERSION_MAJOR}.${MPREAL_FIND_VERSION_MINOR}.${MPREAL_FIND_VERSION_PATCH}")
+endif()
+
+# Check bugs
+# - https://github.com/advanpix/mpreal/issues/7
+# - https://github.com/advanpix/mpreal/issues/9
+set(MPREAL_TEST_PROGRAM "
+#include <mpreal.h>
+#include <algorithm>
+int main(int argc, char** argv) {
+  const mpfr::mpreal one  =    1.0;
+  const mpfr::mpreal zero =    0.0;
+  using namespace std;
+  const mpfr::mpreal smaller = min(one, zero);
+  return 0;
+}")
+
+if(MPREAL_INCLUDES)
+
+  # Set MPREAL_VERSION
+  
+  file(READ "${MPREAL_INCLUDES}/mpreal.h" _mpreal_version_header)
+  
+  string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MAJOR[ \t]+([0-9]+)" _mpreal_major_version_match "${_mpreal_version_header}")
+  set(MPREAL_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MINOR[ \t]+([0-9]+)" _mpreal_minor_version_match "${_mpreal_version_header}")
+  set(MPREAL_MINOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+MPREAL_VERSION_PATCHLEVEL[ \t]+([0-9]+)" _mpreal_patchlevel_version_match "${_mpreal_version_header}")
+  set(MPREAL_PATCHLEVEL_VERSION "${CMAKE_MATCH_1}")
+  
+  set(MPREAL_VERSION ${MPREAL_MAJOR_VERSION}.${MPREAL_MINOR_VERSION}.${MPREAL_PATCHLEVEL_VERSION})
+  
+  # Check whether found version exceeds minimum version
+  
+  if(${MPREAL_VERSION} VERSION_LESS ${MPREAL_FIND_VERSION})
+    set(MPREAL_VERSION_OK FALSE)
+    message(STATUS "MPREAL version ${MPREAL_VERSION} found in ${MPREAL_INCLUDES}, "
+                   "but at least version ${MPREAL_FIND_VERSION} is required")
+  else()
+    set(MPREAL_VERSION_OK TRUE)
+    
+    list(APPEND MPREAL_INCLUDES "${MPFR_INCLUDES}" "${GMP_INCLUDES}")
+    list(REMOVE_DUPLICATES MPREAL_INCLUDES)
+    
+    list(APPEND MPREAL_LIBRARIES "${MPFR_LIBRARIES}" "${GMP_LIBRARIES}")
+    list(REMOVE_DUPLICATES MPREAL_LIBRARIES)
+    
+    # Make sure it compiles with the current compiler.
+    unset(MPREAL_WORKS CACHE)
+    include(CheckCXXSourceCompiles)
+    set(CMAKE_REQUIRED_INCLUDES "${MPREAL_INCLUDES}")
+    set(CMAKE_REQUIRED_LIBRARIES "${MPREAL_LIBRARIES}")
+    check_cxx_source_compiles("${MPREAL_TEST_PROGRAM}" MPREAL_WORKS)
+  endif()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MPREAL DEFAULT_MSG
+                                  MPREAL_INCLUDES MPREAL_VERSION_OK MPREAL_WORKS)
+mark_as_advanced(MPREAL_INCLUDES)
diff --git a/contrib/eigen/cmake/FindMetis.cmake b/contrib/eigen/cmake/FindMetis.cmake
index da2f1f1d7b056f97a185b9da672bfc2cd4cbb220..747f88273d3dcdf114d3f1d938b7c1cd6b191b72 100644
--- a/contrib/eigen/cmake/FindMetis.cmake
+++ b/contrib/eigen/cmake/FindMetis.cmake
@@ -238,7 +238,7 @@ if(METIS_LIBRARIES)
   set(CMAKE_REQUIRED_INCLUDES)
   set(CMAKE_REQUIRED_FLAGS)
   set(CMAKE_REQUIRED_LIBRARIES)
-endif(METIS_LIBRARIES)
+endif()
 
 if (METIS_LIBRARIES)
   list(GET METIS_LIBRARIES 0 first_lib)
@@ -258,7 +258,8 @@ mark_as_advanced(METIS_DIR_FOUND)
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(METIS DEFAULT_MSG
   METIS_LIBRARIES
-  METIS_WORKS)
+  METIS_WORKS
+  METIS_INCLUDE_DIRS)
 #
 # TODO: Add possibility to check for specific functions in the library
 #
diff --git a/contrib/eigen/cmake/FindPTSCOTCH.cmake b/contrib/eigen/cmake/FindPTSCOTCH.cmake
index 1396d05824d182d756a15ab9db14397689ea8565..6ccc743e68cc5711fce90bf74ccb611cebd4759d 100644
--- a/contrib/eigen/cmake/FindPTSCOTCH.cmake
+++ b/contrib/eigen/cmake/FindPTSCOTCH.cmake
@@ -79,20 +79,21 @@ if( PTSCOTCH_FIND_COMPONENTS )
 endif()
 
 # PTSCOTCH depends on Threads, try to find it
+include(CMakeFindDependencyMacro)
 if (NOT THREADS_FOUND)
   if (PTSCOTCH_FIND_REQUIRED)
-    find_package(Threads REQUIRED)
+    find_dependency(Threads REQUIRED)
   else()
-    find_package(Threads)
+    find_dependency(Threads)
   endif()
 endif()
 
 # PTSCOTCH depends on MPI, try to find it
 if (NOT MPI_FOUND)
   if (PTSCOTCH_FIND_REQUIRED)
-    find_package(MPI REQUIRED)
+    find_dependency(MPI REQUIRED)
   else()
-    find_package(MPI)
+    find_dependency(MPI)
   endif()
 endif()
 
@@ -148,18 +149,18 @@ else()
     foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find})
       set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND")
       find_path(PTSCOTCH_${ptscotch_hdr}_DIRS
-	NAMES ${ptscotch_hdr}
-	HINTS ${PTSCOTCH_DIR}
-	PATH_SUFFIXES "include" "include/scotch")
+        NAMES ${ptscotch_hdr}
+        HINTS ${PTSCOTCH_DIR}
+        PATH_SUFFIXES "include" "include/scotch")
       mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS)
     endforeach()
   else()
     foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find})
       set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND")
       find_path(PTSCOTCH_${ptscotch_hdr}_DIRS
-	NAMES ${ptscotch_hdr}
-	HINTS ${_inc_env}
-	PATH_SUFFIXES "scotch")
+        NAMES ${ptscotch_hdr}
+        HINTS ${_inc_env}
+        PATH_SUFFIXES "scotch")
       mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS)
     endforeach()
   endif()
@@ -171,7 +172,6 @@ foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find})
   if (PTSCOTCH_${ptscotch_hdr}_DIRS)
     list(APPEND PTSCOTCH_INCLUDE_DIRS "${PTSCOTCH_${ptscotch_hdr}_DIRS}")
   else ()
-    set(PTSCOTCH_INCLUDE_DIRS "PTSCOTCH_INCLUDE_DIRS-NOTFOUND")
     if (NOT PTSCOTCH_FIND_QUIETLY)
       message(STATUS "Looking for ptscotch -- ${ptscotch_hdr} not found")
     endif()
@@ -229,16 +229,16 @@ else()
     foreach(ptscotch_lib ${PTSCOTCH_libs_to_find})
       set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND")
       find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY
-	NAMES ${ptscotch_lib}
-	HINTS ${PTSCOTCH_DIR}
-	PATH_SUFFIXES lib lib32 lib64)
+        NAMES ${ptscotch_lib}
+        HINTS ${PTSCOTCH_DIR}
+        PATH_SUFFIXES lib lib32 lib64)
     endforeach()
   else()
     foreach(ptscotch_lib ${PTSCOTCH_libs_to_find})
       set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND")
       find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY
-	NAMES ${ptscotch_lib}
-	HINTS ${_lib_env})
+        NAMES ${ptscotch_lib}
+        HINTS ${_lib_env})
     endforeach()
   endif()
 endif()
@@ -255,7 +255,6 @@ foreach(ptscotch_lib ${PTSCOTCH_libs_to_find})
     list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}")
     list(APPEND PTSCOTCH_LIBRARY_DIRS "${${ptscotch_lib}_lib_path}")
   else ()
-    list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}")
     if (NOT PTSCOTCH_FIND_QUIETLY)
       message(STATUS "Looking for ptscotch -- lib ${ptscotch_lib} not found")
     endif()
@@ -355,7 +354,7 @@ if(PTSCOTCH_LIBRARIES)
   set(CMAKE_REQUIRED_INCLUDES)
   set(CMAKE_REQUIRED_FLAGS)
   set(CMAKE_REQUIRED_LIBRARIES)
-endif(PTSCOTCH_LIBRARIES)
+endif()
 
 if (PTSCOTCH_LIBRARIES)
   list(GET PTSCOTCH_LIBRARIES 0 first_lib)
diff --git a/contrib/eigen/cmake/FindPastix.cmake b/contrib/eigen/cmake/FindPastix.cmake
index 470477fdccb95ef8977536589dba5851c16fa7d7..db1427b0a4b2ee16452aa4110c2a2bae36e62ab0 100644
--- a/contrib/eigen/cmake/FindPastix.cmake
+++ b/contrib/eigen/cmake/FindPastix.cmake
@@ -118,7 +118,7 @@ if( PASTIX_FIND_COMPONENTS )
     if (${component} STREQUAL "SCOTCH")
       set(PASTIX_LOOK_FOR_SCOTCH ON)
     endif()
-    if (${component} STREQUAL "SCOTCH")
+    if (${component} STREQUAL "PTSCOTCH")
       set(PASTIX_LOOK_FOR_PTSCOTCH ON)
     endif()
     if (${component} STREQUAL "METIS")
@@ -133,14 +133,14 @@ endif()
 
 # Required dependencies
 # ---------------------
-
+include(CMakeFindDependencyMacro)
 if (NOT PASTIX_FIND_QUIETLY)
   message(STATUS "Looking for PASTIX - Try to detect pthread")
 endif()
 if (PASTIX_FIND_REQUIRED)
-  find_package(Threads REQUIRED QUIET)
+  find_dependency(Threads REQUIRED QUIET)
 else()
-  find_package(Threads QUIET)
+  find_dependency(Threads QUIET)
 endif()
 set(PASTIX_EXTRA_LIBRARIES "")
 if( THREADS_FOUND )
@@ -198,9 +198,9 @@ if (NOT PASTIX_FIND_QUIETLY)
   message(STATUS "Looking for PASTIX - Try to detect HWLOC")
 endif()
 if (PASTIX_FIND_REQUIRED)
-  find_package(HWLOC REQUIRED QUIET)
+  find_dependency(HWLOC REQUIRED QUIET)
 else()
-  find_package(HWLOC QUIET)
+  find_dependency(HWLOC QUIET)
 endif()
 
 # PASTIX depends on BLAS
@@ -209,9 +209,9 @@ if (NOT PASTIX_FIND_QUIETLY)
   message(STATUS "Looking for PASTIX - Try to detect BLAS")
 endif()
 if (PASTIX_FIND_REQUIRED)
-  find_package(BLASEXT REQUIRED QUIET)
+  find_dependency(BLASEXT REQUIRED QUIET)
 else()
-  find_package(BLASEXT QUIET)
+  find_dependency(BLASEXT QUIET)
 endif()
 
 # Optional dependencies
@@ -230,15 +230,15 @@ if (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI)
     set(MPI_C_COMPILER mpicc)
   endif()
   if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_MPI)
-    find_package(MPI REQUIRED QUIET)
+    find_dependency(MPI REQUIRED QUIET)
   else()
-    find_package(MPI QUIET)
+    find_dependency(MPI QUIET)
   endif()
   if (MPI_FOUND)
     mark_as_advanced(MPI_LIBRARY)
     mark_as_advanced(MPI_EXTRA_LIBRARY)
   endif()
-endif (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI)
+endif ()
 
 # PASTIX may depend on STARPU
 #----------------------------
@@ -272,14 +272,14 @@ if( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU)
   endif()
   # set the list of optional dependencies we may discover
   if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_STARPU)
-    find_package(STARPU ${PASTIX_STARPU_VERSION} REQUIRED
+    find_dependency(STARPU ${PASTIX_STARPU_VERSION} REQUIRED
       COMPONENTS ${STARPU_COMPONENT_LIST})
   else()
-    find_package(STARPU ${PASTIX_STARPU_VERSION}
+    find_dependency(STARPU ${PASTIX_STARPU_VERSION}
       COMPONENTS ${STARPU_COMPONENT_LIST})
   endif()
 
-endif( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU)
+endif()
 
 # PASTIX may depends on SCOTCH
 #-----------------------------
@@ -288,9 +288,9 @@ if (NOT SCOTCH_FOUND AND PASTIX_LOOK_FOR_SCOTCH)
     message(STATUS "Looking for PASTIX - Try to detect SCOTCH")
   endif()
   if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_SCOTCH)
-    find_package(SCOTCH REQUIRED QUIET)
+    find_dependency(SCOTCH REQUIRED QUIET)
   else()
-    find_package(SCOTCH QUIET)
+    find_dependency(SCOTCH QUIET)
   endif()
 endif()
 
@@ -301,9 +301,9 @@ if (NOT PTSCOTCH_FOUND AND PASTIX_LOOK_FOR_PTSCOTCH)
     message(STATUS "Looking for PASTIX - Try to detect PTSCOTCH")
   endif()
   if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_PTSCOTCH)
-    find_package(PTSCOTCH REQUIRED QUIET)
+    find_dependency(PTSCOTCH REQUIRED QUIET)
   else()
-    find_package(PTSCOTCH QUIET)
+    find_dependency(PTSCOTCH QUIET)
   endif()
 endif()
 
@@ -314,9 +314,9 @@ if (NOT METIS_FOUND AND PASTIX_LOOK_FOR_METIS)
     message(STATUS "Looking for PASTIX - Try to detect METIS")
   endif()
   if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_METIS)
-    find_package(METIS REQUIRED QUIET)
+    find_dependency(METIS REQUIRED QUIET)
   else()
-    find_package(METIS QUIET)
+    find_dependency(METIS QUIET)
   endif()
 endif()
 
@@ -478,7 +478,7 @@ foreach(pastix_lib ${PASTIX_libs_to_find})
   endif()
   mark_as_advanced(PASTIX_${pastix_lib}_LIBRARY)
 
-endforeach(pastix_lib ${PASTIX_libs_to_find})
+endforeach()
 
 # check a function to validate the find
 if(PASTIX_LIBRARIES)
@@ -681,7 +681,7 @@ if(PASTIX_LIBRARIES)
   set(CMAKE_REQUIRED_INCLUDES)
   set(CMAKE_REQUIRED_FLAGS)
   set(CMAKE_REQUIRED_LIBRARIES)
-endif(PASTIX_LIBRARIES)
+endif()
 
 if (PASTIX_LIBRARIES)
   list(GET PASTIX_LIBRARIES 0 first_lib)
diff --git a/contrib/eigen/cmake/FindSPQR.cmake b/contrib/eigen/cmake/FindSPQR.cmake
index 1e958c3c1b7210602e2cdb4c4e92d42187f58156..d6fb2e13d98065505bd959298ebeb9d488cb5507 100644
--- a/contrib/eigen/cmake/FindSPQR.cmake
+++ b/contrib/eigen/cmake/FindSPQR.cmake
@@ -6,7 +6,7 @@
 
 if (SPQR_INCLUDES AND SPQR_LIBRARIES)
   set(SPQR_FIND_QUIETLY TRUE)
-endif (SPQR_INCLUDES AND SPQR_LIBRARIES)
+endif ()
 
 find_path(SPQR_INCLUDES
   NAMES
@@ -33,7 +33,7 @@ if(SPQR_LIBRARIES)
     set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARY})
   endif()
   
-endif(SPQR_LIBRARIES)
+endif()
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(SPQR DEFAULT_MSG SPQR_INCLUDES SPQR_LIBRARIES)
diff --git a/contrib/eigen/cmake/FindScotch.cmake b/contrib/eigen/cmake/FindScotch.cmake
index 89d295ac2eb782e87c3ca8cf003b048c00bb2974..11b971a926443aa415fb28759f6aa2f04f703e41 100644
--- a/contrib/eigen/cmake/FindScotch.cmake
+++ b/contrib/eigen/cmake/FindScotch.cmake
@@ -71,11 +71,12 @@ if( SCOTCH_FIND_COMPONENTS )
 endif()
 
 # SCOTCH may depend on Threads, try to find it
+include(CMakeFindDependencyMacro)
 if (NOT THREADS_FOUND)
   if (SCOTCH_FIND_REQUIRED)
-    find_package(Threads REQUIRED)
+    find_dependency(Threads REQUIRED)
   else()
-    find_package(Threads)
+    find_dependency(Threads)
   endif()
 endif()
 
@@ -301,7 +302,7 @@ if(SCOTCH_LIBRARIES)
   set(CMAKE_REQUIRED_INCLUDES)
   set(CMAKE_REQUIRED_FLAGS)
   set(CMAKE_REQUIRED_LIBRARIES)
-endif(SCOTCH_LIBRARIES)
+endif()
 
 if (SCOTCH_LIBRARIES)
   list(GET SCOTCH_LIBRARIES 0 first_lib)
diff --git a/contrib/eigen/cmake/FindStandardMathLibrary.cmake b/contrib/eigen/cmake/FindStandardMathLibrary.cmake
index 711b0e4b4f1d82323123088b4ef04cd0836b5f9f..1d1e5b3a91fd89f821b8fca3e884ab4dd59e5a43 100644
--- a/contrib/eigen/cmake/FindStandardMathLibrary.cmake
+++ b/contrib/eigen/cmake/FindStandardMathLibrary.cmake
@@ -10,6 +10,7 @@
 #                               pass the "-lm" linker flag.
 #
 # Copyright (c) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+#               2020 Susi Lehtola <susi.lehtola@gmail.com>
 # Redistribution and use is allowed according to the terms of the 2-clause BSD license.
 
 
@@ -17,10 +18,15 @@ include(CheckCXXSourceCompiles)
 
 # a little test program for c++ math functions.
 # notice the std:: is required on some platforms such as QNX
+# notice the (void) is required if -Wall (-Wunused-value) is added to CMAKE_CXX_FLAG
 
+# We read in the arguments from standard input to avoid the compiler optimizing away the calls
 set(find_standard_math_library_test_program
-"#include<cmath>
-int main() { std::sin(0.0); std::log(0.0f); }")
+"
+#include<cmath>
+int main(int argc, char **){
+  return int(std::sin(double(argc)) + std::log(double(argc)));
+}")
 
 # first try compiling/linking the test program without any linker flags
 
diff --git a/contrib/eigen/cmake/FindSuperLU.cmake b/contrib/eigen/cmake/FindSuperLU.cmake
index f38146e06c229d99df6bd80c6b00407e5a69a425..4b779f51613fcee770ab86cdb7e51020a3898a73 100644
--- a/contrib/eigen/cmake/FindSuperLU.cmake
+++ b/contrib/eigen/cmake/FindSuperLU.cmake
@@ -4,7 +4,7 @@
 
 if (SUPERLU_INCLUDES AND SUPERLU_LIBRARIES)
   set(SUPERLU_FIND_QUIETLY TRUE)
-endif (SUPERLU_INCLUDES AND SUPERLU_LIBRARIES)
+endif ()
 
 find_path(SUPERLU_INCLUDES
   NAMES
@@ -90,7 +90,7 @@ endif()
 endif()
 
 include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(SUPERLU
+find_package_handle_standard_args(SuperLU
                                   REQUIRED_VARS SUPERLU_INCLUDES SUPERLU_LIBRARIES SUPERLU_VERSION_OK
                                   VERSION_VAR SUPERLU_VERSION_VAR)
 
diff --git a/contrib/eigen/cmake/FindTriSYCL.cmake b/contrib/eigen/cmake/FindTriSYCL.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..81042390729788ca8a644e52ba901af74d355120
--- /dev/null
+++ b/contrib/eigen/cmake/FindTriSYCL.cmake
@@ -0,0 +1,173 @@
+#.rst:
+# FindTriSYCL
+#---------------
+#
+# TODO : insert Copyright and licence
+
+#########################
+#  FindTriSYCL.cmake
+#########################
+#
+#  Tools for finding and building with TriSYCL.
+#
+#  User must define TRISYCL_INCLUDE_DIR pointing to the triSYCL
+#  include directory.
+#
+#  Latest version of this file can be found at:
+#    https://github.com/triSYCL/triSYCL
+
+# Requite CMake version 3.5 or higher
+cmake_minimum_required (VERSION 3.5)
+
+# Check that a supported host compiler can be found
+if(CMAKE_COMPILER_IS_GNUCXX)
+  # Require at least gcc 5.4
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4)
+    message(FATAL_ERROR
+      "host compiler - Not found! (gcc version must be at least 5.4)")
+  else()
+    message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
+  endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  # Require at least clang 3.9
+  if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.9)
+    message(FATAL_ERROR
+      "host compiler - Not found! (clang version must be at least 3.9)")
+  else()
+    message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
+  endif()
+else()
+  message(WARNING
+    "host compiler - Not found! (triSYCL supports GCC and Clang)")
+endif()
+
+#triSYCL options
+option(TRISYCL_OPENMP "triSYCL multi-threading with OpenMP" ON)
+option(TRISYCL_OPENCL "triSYCL OpenCL interoperability mode" OFF)
+option(TRISYCL_NO_ASYNC "triSYCL use synchronous kernel execution" OFF)
+option(TRISYCL_DEBUG "triSCYL use debug mode" OFF)
+option(TRISYCL_DEBUG_STRUCTORS "triSYCL trace of object lifetimes" OFF)
+option(TRISYCL_TRACE_KERNEL "triSYCL trace of kernel execution" OFF)
+
+mark_as_advanced(TRISYCL_OPENMP)
+mark_as_advanced(TRISYCL_OPENCL)
+mark_as_advanced(TRISYCL_NO_ASYNC)
+mark_as_advanced(TRISYCL_DEBUG)
+mark_as_advanced(TRISYCL_DEBUG_STRUCTORS)
+mark_as_advanced(TRISYCL_TRACE_KERNEL)
+
+#triSYCL definitions
+set(CL_SYCL_LANGUAGE_VERSION 220 CACHE STRING
+  "Host language version to be used by trisYCL (default is: 220)")
+set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE STRING
+  "Device language version to be used by trisYCL (default is: 220)")
+# triSYCL now requires c++17
+set(CMAKE_CXX_STANDARD 17)
+set(CXX_STANDARD_REQUIRED ON)
+
+
+# Find OpenCL package
+include(CMakeFindDependencyMacro)
+if(TRISYCL_OPENCL)
+  find_dependency(OpenCL REQUIRED)
+  if(UNIX)
+    set(BOOST_COMPUTE_INCPATH /usr/include/compute CACHE PATH
+      "Path to Boost.Compute headers (default is: /usr/include/compute)")
+  endif()
+endif()
+
+# Find OpenMP package
+if(TRISYCL_OPENMP)
+  find_dependency(OpenMP REQUIRED)
+endif()
+
+# Find Boost
+find_dependency(Boost 1.58 REQUIRED COMPONENTS chrono log)
+
+# If debug or trace we need boost log
+if(TRISYCL_DEBUG OR TRISYCL_DEBUG_STRUCTORS OR TRISYCL_TRACE_KERNEL)
+  set(LOG_NEEDED ON)
+else()
+  set(LOG_NEEDED OFF)
+endif()
+
+find_dependency(Threads REQUIRED)
+
+# Find triSYCL directory
+if (TRISYCL_INCLUDES AND TRISYCL_LIBRARIES)
+  set(TRISYCL_FIND_QUIETLY TRUE)
+endif ()
+
+find_path(TRISYCL_INCLUDE_DIR
+  NAMES sycl.hpp
+  PATHS $ENV{TRISYCLDIR} $ENV{TRISYCLDIR}/include ${INCLUDE_INSTALL_DIR}
+  PATH_SUFFIXES triSYCL
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(TriSYCL DEFAULT_MSG
+                                  TRISYCL_INCLUDE_DIR)
+
+if(NOT TRISYCL_INCLUDE_DIR)
+  message(FATAL_ERROR
+    "triSYCL include directory - Not found! (please set TRISYCL_INCLUDE_DIR")
+else()
+  message(STATUS "triSYCL include directory - Found ${TRISYCL_INCLUDE_DIR}")
+endif()
+
+include(CMakeParseArguments)
+#######################
+#  add_sycl_to_target
+#######################
+function(add_sycl_to_target)
+  set(options)
+  set(one_value_args
+    TARGET
+  )
+  set(multi_value_args
+    SOURCES
+  )
+  cmake_parse_arguments(ADD_SYCL_ARGS
+    "${options}"
+    "${one_value_args}"
+    "${multi_value_args}"
+    ${ARGN}
+  )
+
+  # Add include directories to the "#include <>" paths
+  target_include_directories (${ADD_SYCL_ARGS_TARGET} PUBLIC
+    ${TRISYCL_INCLUDE_DIR}
+    ${Boost_INCLUDE_DIRS}
+    $<$<BOOL:${TRISYCL_OPENCL}>:${OpenCL_INCLUDE_DIRS}>
+    $<$<BOOL:${TRISYCL_OPENCL}>:${BOOST_COMPUTE_INCPATH}>)
+
+  # Link dependencies
+  target_link_libraries(${ADD_SYCL_ARGS_TARGET}
+    $<$<BOOL:${TRISYCL_OPENCL}>:${OpenCL_LIBRARIES}>
+    Threads::Threads
+    $<$<BOOL:${LOG_NEEDED}>:Boost::log>
+    Boost::chrono)
+
+  # Compile definitions
+  target_compile_definitions(${ADD_SYCL_ARGS_TARGET} PUBLIC
+    EIGEN_SYCL_TRISYCL
+    $<$<BOOL:${TRISYCL_NO_ASYNC}>:TRISYCL_NO_ASYNC>
+    $<$<BOOL:${TRISYCL_OPENCL}>:TRISYCL_OPENCL>
+    $<$<BOOL:${TRISYCL_DEBUG}>:TRISYCL_DEBUG>
+    $<$<BOOL:${TRISYCL_DEBUG_STRUCTORS}>:TRISYCL_DEBUG_STRUCTORS>
+    $<$<BOOL:${TRISYCL_TRACE_KERNEL}>:TRISYCL_TRACE_KERNEL>
+    $<$<BOOL:${LOG_NEEDED}>:BOOST_LOG_DYN_LINK>)
+
+  # C++ and OpenMP requirements
+  target_compile_options(${ADD_SYCL_ARGS_TARGET} PUBLIC
+    ${TRISYCL_COMPILE_OPTIONS}
+    $<$<BOOL:${TRISYCL_OPENMP}>:${OpenMP_CXX_FLAGS}>)
+
+  if(${TRISYCL_OPENMP} AND (NOT WIN32))
+    # Does not support generator expressions
+    set_target_properties(${ADD_SYCL_ARGS_TARGET}
+      PROPERTIES
+      LINK_FLAGS ${OpenMP_CXX_FLAGS})
+  endif()
+
+endfunction()
diff --git a/contrib/eigen/cmake/FindUmfpack.cmake b/contrib/eigen/cmake/FindUmfpack.cmake
index 53cf0b49b2e23d3b8e8334e2ee53e69e313f6fbd..91cf6372f798dbdb86d46bfc8d7f56b913524395 100644
--- a/contrib/eigen/cmake/FindUmfpack.cmake
+++ b/contrib/eigen/cmake/FindUmfpack.cmake
@@ -3,7 +3,7 @@
 
 if (UMFPACK_INCLUDES AND UMFPACK_LIBRARIES)
   set(UMFPACK_FIND_QUIETLY TRUE)
-endif (UMFPACK_INCLUDES AND UMFPACK_LIBRARIES)
+endif ()
 
 find_path(UMFPACK_INCLUDES
   NAMES
@@ -22,7 +22,7 @@ if(UMFPACK_LIBRARIES)
 
   if(NOT UMFPACK_LIBDIR)
     get_filename_component(UMFPACK_LIBDIR ${UMFPACK_LIBRARIES} PATH)
-  endif(NOT UMFPACK_LIBDIR)
+  endif()
 
   find_library(COLAMD_LIBRARY colamd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
   if(COLAMD_LIBRARY)
@@ -44,7 +44,7 @@ if(UMFPACK_LIBRARIES)
     set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${CHOLMOD_LIBRARY})
   endif()
 
-endif(UMFPACK_LIBRARIES)
+endif()
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(UMFPACK DEFAULT_MSG
diff --git a/contrib/eigen/cmake/RegexUtils.cmake b/contrib/eigen/cmake/RegexUtils.cmake
index b59dfc340f524d25add382306ba143e662baabdb..f0a15248bb8f5ad912ad492d642bd0b3e3c6d2e0 100644
--- a/contrib/eigen/cmake/RegexUtils.cmake
+++ b/contrib/eigen/cmake/RegexUtils.cmake
@@ -1,19 +1,19 @@
 function(escape_string_as_regex _str_out _str_in)
-  STRING(REGEX REPLACE "\\\\" "\\\\\\\\" FILETEST2 "${_str_in}")
-  STRING(REGEX REPLACE "([.$+*?|-])" "\\\\\\1" FILETEST2 "${FILETEST2}")
-  STRING(REGEX REPLACE "\\^" "\\\\^" FILETEST2 "${FILETEST2}")
-  STRING(REGEX REPLACE "\\(" "\\\\(" FILETEST2 "${FILETEST2}")
-  STRING(REGEX REPLACE "\\)" "\\\\)" FILETEST2 "${FILETEST2}")
-  STRING(REGEX REPLACE "\\[" "\\\\[" FILETEST2 "${FILETEST2}")
-  STRING(REGEX REPLACE "\\]" "\\\\]" FILETEST2 "${FILETEST2}")
-  SET(${_str_out} "${FILETEST2}" PARENT_SCOPE)
+  string(REGEX REPLACE "\\\\" "\\\\\\\\" FILETEST2 "${_str_in}")
+  string(REGEX REPLACE "([.$+*?|-])" "\\\\\\1" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\^" "\\\\^" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\(" "\\\\(" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\)" "\\\\)" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\[" "\\\\[" FILETEST2 "${FILETEST2}")
+  string(REGEX REPLACE "\\]" "\\\\]" FILETEST2 "${FILETEST2}")
+  set(${_str_out} "${FILETEST2}" PARENT_SCOPE)
 endfunction()
 
 function(test_escape_string_as_regex)
-  SET(test1 "\\.^$-+*()[]?|")
+  set(test1 "\\.^$-+*()[]?|")
   escape_string_as_regex(test2 "${test1}")
-  SET(testRef "\\\\\\.\\^\\$\\-\\+\\*\\(\\)\\[\\]\\?\\|")
+  set(testRef "\\\\\\.\\^\\$\\-\\+\\*\\(\\)\\[\\]\\?\\|")
   if(NOT test2 STREQUAL testRef)
 	message("Error in the escape_string_for_regex function : \n   ${test1} was escaped as ${test2}, should be ${testRef}")
-  endif(NOT test2 STREQUAL testRef)
+  endif()
 endfunction()
\ No newline at end of file
diff --git a/contrib/eigen/cmake/language_support.cmake b/contrib/eigen/cmake/language_support.cmake
deleted file mode 100644
index ddba509459fdc3677002789f558c11fc3f0cc57f..0000000000000000000000000000000000000000
--- a/contrib/eigen/cmake/language_support.cmake
+++ /dev/null
@@ -1,67 +0,0 @@
-# cmake/modules/language_support.cmake
-#
-# Temporary additional general language support is contained within this
-# file.  
-
-# This additional function definition is needed to provide a workaround for
-# CMake bug 9220.
-
-# On debian testing (cmake 2.6.2), I get return code zero when calling 
-# cmake the first time, but cmake crashes when running a second time
-# as follows:
-#
-#  -- The Fortran compiler identification is unknown
-#  CMake Error at /usr/share/cmake-2.6/Modules/CMakeFortranInformation.cmake:7 (GET_FILENAME_COMPONENT):
-#    get_filename_component called with incorrect number of arguments
-#  Call Stack (most recent call first):
-#    CMakeLists.txt:3 (enable_language)
-#
-# My workaround is to invoke cmake twice.  If both return codes are zero, 
-# it is safe to invoke ENABLE_LANGUAGE(Fortran OPTIONAL)
-
-function(workaround_9220 language language_works)
-  #message("DEBUG: language = ${language}")
-  set(text
-    "project(test NONE)
-    cmake_minimum_required(VERSION 2.8.0)
-    set (CMAKE_Fortran_FLAGS \"${CMAKE_Fortran_FLAGS}\")
-    set (CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS}\")
-    enable_language(${language})
-  ")
-  file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/language_tests/${language})
-  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language})
-  file(WRITE ${CMAKE_BINARY_DIR}/language_tests/${language}/CMakeLists.txt
-    ${text})
-  execute_process(
-    COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}"
-    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}
-    RESULT_VARIABLE return_code
-    OUTPUT_QUIET
-    ERROR_QUIET
-    )
-
-  if(return_code EQUAL 0)
-    # Second run
-    execute_process (
-      COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}"
-      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}
-      RESULT_VARIABLE return_code
-      OUTPUT_QUIET
-      ERROR_QUIET
-      )
-    if(return_code EQUAL 0)
-      set(${language_works} ON PARENT_SCOPE)
-    else(return_code EQUAL 0)
-      set(${language_works} OFF PARENT_SCOPE)
-    endif(return_code EQUAL 0)
-  else(return_code EQUAL 0)
-    set(${language_works} OFF PARENT_SCOPE)
-  endif(return_code EQUAL 0)
-endfunction(workaround_9220)
-
-# Temporary tests of the above function.
-#workaround_9220(CXX CXX_language_works)
-#message("CXX_language_works = ${CXX_language_works}")
-#workaround_9220(CXXp CXXp_language_works)
-#message("CXXp_language_works = ${CXXp_language_works}")
-
diff --git a/contrib/eigen/debug/gdb/printers.py b/contrib/eigen/debug/gdb/printers.py
index 0d67a5f998b022afccd06713a33b73b15a12260b..24961d1159bf95db201ccd79bbd97a2866d54a2b 100644
--- a/contrib/eigen/debug/gdb/printers.py
+++ b/contrib/eigen/debug/gdb/printers.py
@@ -10,8 +10,7 @@
 
 # Pretty printers for Eigen::Matrix
 # This is still pretty basic as the python extension to gdb is still pretty basic. 
-# It cannot handle complex eigen types and it doesn't support any of the other eigen types
-# Such as quaternion or some other type. 
+# It cannot handle complex eigen types and it doesn't support many of the other eigen types
 # This code supports fixed size as well as dynamic size matrices
 
 # To use it:
@@ -29,7 +28,45 @@
 import gdb
 import re
 import itertools
+from bisect import bisect_left
 
+# Basic row/column iteration code for use with Sparse and Dense matrices
+class _MatrixEntryIterator(object):
+	
+	def __init__ (self, rows, cols, rowMajor):
+		self.rows = rows
+		self.cols = cols
+		self.currentRow = 0
+		self.currentCol = 0
+		self.rowMajor = rowMajor
+
+	def __iter__ (self):
+		return self
+
+	def next(self):
+                return self.__next__()  # Python 2.x compatibility
+
+	def __next__(self):
+		row = self.currentRow
+		col = self.currentCol
+		if self.rowMajor == 0:
+			if self.currentCol >= self.cols:
+				raise StopIteration
+				
+			self.currentRow = self.currentRow + 1
+			if self.currentRow >= self.rows:
+				self.currentRow = 0
+				self.currentCol = self.currentCol + 1
+		else:
+			if self.currentRow >= self.rows:
+				raise StopIteration
+				
+			self.currentCol = self.currentCol + 1
+			if self.currentCol >= self.cols:
+				self.currentCol = 0
+				self.currentRow = self.currentRow + 1
+
+		return (row, col)
 
 class EigenMatrixPrinter:
 	"Print Eigen Matrix or Array of some kind"
@@ -77,42 +114,15 @@ class EigenMatrixPrinter:
 			self.data = self.data['array']
 			self.data = self.data.cast(self.innerType.pointer())
 			
-	class _iterator:
+	class _iterator(_MatrixEntryIterator):
 		def __init__ (self, rows, cols, dataPtr, rowMajor):
-			self.rows = rows
-			self.cols = cols
-			self.dataPtr = dataPtr
-			self.currentRow = 0
-			self.currentCol = 0
-			self.rowMajor = rowMajor
-			
-		def __iter__ (self):
-			return self
+			super(EigenMatrixPrinter._iterator, self).__init__(rows, cols, rowMajor)
 
-		def next(self):
-                        return self.__next__()  # Python 2.x compatibility
+			self.dataPtr = dataPtr
 
 		def __next__(self):
 			
-			row = self.currentRow
-			col = self.currentCol
-			if self.rowMajor == 0:
-				if self.currentCol >= self.cols:
-					raise StopIteration
-					
-				self.currentRow = self.currentRow + 1
-				if self.currentRow >= self.rows:
-					self.currentRow = 0
-					self.currentCol = self.currentCol + 1
-			else:
-				if self.currentRow >= self.rows:
-					raise StopIteration
-					
-				self.currentCol = self.currentCol + 1
-				if self.currentCol >= self.cols:
-					self.currentCol = 0
-					self.currentRow = self.currentRow + 1
-				
+			row, col = super(EigenMatrixPrinter._iterator, self).__next__()
 			
 			item = self.dataPtr.dereference()
 			self.dataPtr = self.dataPtr + 1
@@ -129,6 +139,95 @@ class EigenMatrixPrinter:
 	def to_string(self):
 		return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % (self.variety, self.innerType, self.rows, self.cols, "RowMajor" if self.rowMajor else  "ColMajor", self.data)
 
+class EigenSparseMatrixPrinter:
+	"Print an Eigen SparseMatrix"
+
+	def __init__(self, val):
+		"Extract all the necessary information"
+
+		type = val.type
+		if type.code == gdb.TYPE_CODE_REF:
+			type = type.target()
+		self.type = type.unqualified().strip_typedefs()
+		tag = self.type.tag
+		regex = re.compile('\<.*\>')
+		m = regex.findall(tag)[0][1:-1]
+		template_params = m.split(',')
+		template_params = [x.replace(" ", "") for x in template_params]
+
+		self.options = 0
+		if len(template_params) > 1:
+			self.options = template_params[1];
+		
+		self.rowMajor = (int(self.options) & 0x1)
+		
+		self.innerType = self.type.template_argument(0)
+		
+		self.val = val
+
+		self.data = self.val['m_data']
+		self.data = self.data.cast(self.innerType.pointer())
+
+	class _iterator(_MatrixEntryIterator):
+		def __init__ (self, rows, cols, val, rowMajor):
+			super(EigenSparseMatrixPrinter._iterator, self).__init__(rows, cols, rowMajor)
+
+			self.val = val
+			
+		def __next__(self):
+			
+			row, col = super(EigenSparseMatrixPrinter._iterator, self).__next__()
+				
+			# repeat calculations from SparseMatrix.h:
+			outer = row if self.rowMajor else col
+			inner = col if self.rowMajor else row
+			start = self.val['m_outerIndex'][outer]
+			end = ((start + self.val['m_innerNonZeros'][outer]) if self.val['m_innerNonZeros'] else
+			       self.val['m_outerIndex'][outer+1])
+
+			# and from CompressedStorage.h:
+			data = self.val['m_data']
+			if start >= end:
+				item = 0
+			elif (end > start) and (inner == data['m_indices'][end-1]):
+				item = data['m_values'][end-1]
+			else:
+				# create Python index list from the target range within m_indices
+				indices = [data['m_indices'][x] for x in range(int(start), int(end)-1)]
+				# find the index with binary search
+				idx = int(start) + bisect_left(indices, inner)
+				if ((idx < end) and (data['m_indices'][idx] == inner)):
+					item = data['m_values'][idx]
+				else:
+					item = 0
+
+			return ('[%d,%d]' % (row, col), item)
+
+	def children(self):
+		if self.data:
+			return self._iterator(self.rows(), self.cols(), self.val, self.rowMajor)
+
+		return iter([])   # empty matrix, for now
+
+
+	def rows(self):
+		return self.val['m_outerSize'] if self.rowMajor else self.val['m_innerSize']
+
+	def cols(self):
+		return self.val['m_innerSize'] if self.rowMajor else self.val['m_outerSize']
+
+	def to_string(self):
+
+		if self.data:
+			status = ("not compressed" if self.val['m_innerNonZeros'] else "compressed")
+		else:
+			status = "empty"
+		dimensions  = "%d x %d" % (self.rows(), self.cols())
+		layout      = "row" if self.rowMajor else "column"
+
+		return "Eigen::SparseMatrix<%s>, %s, %s major, %s" % (
+			self.innerType, dimensions, layout, status )
+
 class EigenQuaternionPrinter:
 	"Print an Eigen Quaternion"
 	
@@ -156,7 +255,7 @@ class EigenQuaternionPrinter:
 			return self
 	
 		def next(self):
-                        return self.__next__()  # Python 2.x compatibility
+			return self.__next__()  # Python 2.x compatibility
 
 		def __next__(self):
 			element = self.currentElement
@@ -180,6 +279,7 @@ class EigenQuaternionPrinter:
 def build_eigen_dictionary ():
 	pretty_printers_dict[re.compile('^Eigen::Quaternion<.*>$')] = lambda val: EigenQuaternionPrinter(val)
 	pretty_printers_dict[re.compile('^Eigen::Matrix<.*>$')] = lambda val: EigenMatrixPrinter("Matrix", val)
+	pretty_printers_dict[re.compile('^Eigen::SparseMatrix<.*>$')] = lambda val: EigenSparseMatrixPrinter(val)
 	pretty_printers_dict[re.compile('^Eigen::Array<.*>$')]  = lambda val: EigenMatrixPrinter("Array",  val)
 
 def register_eigen_printers(obj):
diff --git a/contrib/eigen/debug/msvc/eigen_autoexp_part.dat b/contrib/eigen/debug/msvc/eigen_autoexp_part.dat
index 07aa43739e5a8e200c1eed1c9ecc348c2a32e17d..35ef5807ccd3824c6976dfbcea8ef4f19721b412 100644
--- a/contrib/eigen/debug/msvc/eigen_autoexp_part.dat
+++ b/contrib/eigen/debug/msvc/eigen_autoexp_part.dat
@@ -14,7 +14,7 @@
 ; * - Eigen::Matrix<*,-1,+,*,*,*>
 ; * - Eigen::Matrix<*,+,+,*,*,*>
 ; *
-; * Matrices are displayed properly independantly of the memory
+; * Matrices are displayed properly independently of the memory
 ; * alignment (RowMajor vs. ColMajor).
 ; *
 ; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
diff --git a/contrib/eigen/demos/CMakeLists.txt b/contrib/eigen/demos/CMakeLists.txt
index b0d2eddbb8eaaf87a668274fbe63bddfb02ac792..deb560f09706638f7c866b36c0c1f59a7d5b7b58 100644
--- a/contrib/eigen/demos/CMakeLists.txt
+++ b/contrib/eigen/demos/CMakeLists.txt
@@ -7,7 +7,7 @@ if(NOT EIGEN_TEST_NOQT)
   if(QT4_FOUND)
     add_subdirectory(mandelbrot)
     add_subdirectory(opengl)
-  else(QT4_FOUND)
+  else()
     message(STATUS "Qt4 not found, so disabling the mandelbrot and opengl demos")
-  endif(QT4_FOUND)
+  endif()
 endif()
diff --git a/contrib/eigen/demos/mandelbrot/CMakeLists.txt b/contrib/eigen/demos/mandelbrot/CMakeLists.txt
index 5c500e064dec6a01c8b203e250f746d26b7e626f..ae6001dbca64d639a474682ff484b9e8debadc65 100644
--- a/contrib/eigen/demos/mandelbrot/CMakeLists.txt
+++ b/contrib/eigen/demos/mandelbrot/CMakeLists.txt
@@ -5,7 +5,7 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON)
 if (CMAKE_COMPILER_IS_GNUCXX)
    set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
    add_definitions ( "-DNDEBUG" )
-endif (CMAKE_COMPILER_IS_GNUCXX)
+endif ()
 
 include_directories( ${QT_INCLUDE_DIR} )
 
diff --git a/contrib/eigen/demos/mix_eigen_and_c/README b/contrib/eigen/demos/mix_eigen_and_c/README
index 21dba8679e36c67477342f26642703adb66c6251..d9cc9275db780b49780e68f043ef02464ecf6a44 100644
--- a/contrib/eigen/demos/mix_eigen_and_c/README
+++ b/contrib/eigen/demos/mix_eigen_and_c/README
@@ -6,4 +6,4 @@ To try this with GCC, do:
   gcc example.c binary_library.o -o example -lstdc++
   ./example
 
-TODO: add CMakeLists, add more explanations here
\ No newline at end of file
+TODO: add CMakeLists, add more explanations here
diff --git a/contrib/eigen/demos/mix_eigen_and_c/binary_library.h b/contrib/eigen/demos/mix_eigen_and_c/binary_library.h
index 0b983ad3a6035273437afba11ebea1d151869c81..9b63fac2f905c0c352aa99e469c863fcd9d5d894 100644
--- a/contrib/eigen/demos/mix_eigen_and_c/binary_library.h
+++ b/contrib/eigen/demos/mix_eigen_and_c/binary_library.h
@@ -68,4 +68,4 @@ extern "C"
 
 #ifdef __cplusplus
 } // end extern "C"
-#endif
\ No newline at end of file
+#endif
diff --git a/contrib/eigen/doc/A05_PortingFrom2To3.dox b/contrib/eigen/doc/A05_PortingFrom2To3.dox
deleted file mode 100644
index 51555f9967e7d8ef5371438f8fe355eb9344bdf4..0000000000000000000000000000000000000000
--- a/contrib/eigen/doc/A05_PortingFrom2To3.dox
+++ /dev/null
@@ -1,299 +0,0 @@
-namespace Eigen {
-
-/** \page Eigen2ToEigen3 Porting from Eigen2 to Eigen3
-
-This page lists the most important API changes between Eigen2 and Eigen3,
-and gives tips to help porting your application from Eigen2 to Eigen3.
-
-\eigenAutoToc
-
-\section CompatibilitySupport Eigen2 compatibility support
-
-Up to version 3.2 %Eigen provides <a href="http://eigen.tuxfamily.org/dox-3.2/Eigen2SupportModes.html">Eigen2 support modes</a>. These are removed now, because they were barely used anymore and became hard to maintain after internal re-designs.
-You can still use them by first <a href="http://eigen.tuxfamily.org/dox-3.2/Eigen2ToEigen3.html">porting your code to Eigen 3.2</a>.
-
-\section Using The USING_PART_OF_NAMESPACE_EIGEN macro
-
-The USING_PART_OF_NAMESPACE_EIGEN macro has been removed. In Eigen 3, just do:
-\code
-using namespace Eigen;
-\endcode
-
-\section ComplexDot Dot products over complex numbers
-
-This is the single trickiest change between Eigen 2 and Eigen 3. It only affects code using \c std::complex numbers as scalar type.
-
-Eigen 2's dot product was linear in the first variable. Eigen 3's dot product is linear in the second variable. In other words, the Eigen 2 code \code x.dot(y) \endcode is equivalent to the Eigen 3 code \code y.dot(x) \endcode In yet other words, dot products are complex-conjugated in Eigen 3 compared to Eigen 2. The switch to the new convention was commanded by common usage, especially with the notation \f$ x^Ty \f$ for dot products of column-vectors.
-
-\section VectorBlocks Vector blocks
-
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th></th>
-<tr><td>\code
-vector.start(length)
-vector.start<length>()
-vector.end(length)
-vector.end<length>()
-\endcode</td><td>\code
-vector.head(length)
-vector.head<length>()
-vector.tail(length)
-vector.tail<length>()
-\endcode</td></tr>
-</table>
-
-
-\section Corners Matrix Corners
-
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th></th>
-<tr><td>\code
-matrix.corner(TopLeft,r,c)
-matrix.corner(TopRight,r,c)
-matrix.corner(BottomLeft,r,c)
-matrix.corner(BottomRight,r,c)
-matrix.corner<r,c>(TopLeft)
-matrix.corner<r,c>(TopRight)
-matrix.corner<r,c>(BottomLeft)
-matrix.corner<r,c>(BottomRight)
-\endcode</td><td>\code
-matrix.topLeftCorner(r,c)
-matrix.topRightCorner(r,c)
-matrix.bottomLeftCorner(r,c)
-matrix.bottomRightCorner(r,c)
-matrix.topLeftCorner<r,c>()
-matrix.topRightCorner<r,c>()
-matrix.bottomLeftCorner<r,c>()
-matrix.bottomRightCorner<r,c>()
-\endcode</td>
-</tr>
-</table>
-
-Notice that Eigen3 also provides these new convenience methods: topRows(), bottomRows(), leftCols(), rightCols(). See in class DenseBase.
-
-\section CoefficientWiseOperations Coefficient wise operations
-
-In Eigen2, coefficient wise operations which have no proper mathematical definition (as a coefficient wise product)
-were achieved using the .cwise() prefix, e.g.:
-\code a.cwise() * b \endcode
-In Eigen3 this .cwise() prefix has been superseded by a new kind of matrix type called
-Array for which all operations are performed coefficient wise. You can easily view a matrix as an array and vice versa using
-the MatrixBase::array() and ArrayBase::matrix() functions respectively. Here is an example:
-\code
-Vector4f a, b, c;
-c = a.array() * b.array();
-\endcode
-Note that the .array() function is not at all a synonym of the deprecated .cwise() prefix.
-While the .cwise() prefix changed the behavior of the following operator, the array() function performs
-a permanent conversion to the array world. Therefore, for binary operations such as the coefficient wise product,
-both sides must be converted to an \em array as in the above example. On the other hand, when you
-concatenate multiple coefficient wise operations you only have to do the conversion once, e.g.:
-\code
-Vector4f a, b, c;
-c = a.array().abs().pow(3) * b.array().abs().sin();
-\endcode
-With Eigen2 you would have written:
-\code
-c = (a.cwise().abs().cwise().pow(3)).cwise() * (b.cwise().abs().cwise().sin());
-\endcode
-
-\section PartAndExtract Triangular and self-adjoint matrices
-
-In Eigen 2 you had to play with the part, extract, and marked functions to deal with triangular and selfadjoint matrices. In Eigen 3, all these functions have been removed in favor of the concept of \em views:
-
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th></tr>
-<tr><td>\code
-A.part<UpperTriangular>();
-A.part<StrictlyLowerTriangular>(); \endcode</td>
-<td>\code
-A.triangularView<Upper>()
-A.triangularView<StrictlyLower>()\endcode</td></tr>
-<tr><td>\code
-A.extract<UpperTriangular>();
-A.extract<StrictlyLowerTriangular>();\endcode</td>
-<td>\code
-A.triangularView<Upper>()
-A.triangularView<StrictlyLower>()\endcode</td></tr>
-<tr><td>\code
-A.marked<UpperTriangular>();
-A.marked<StrictlyLowerTriangular>();\endcode</td>
-<td>\code
-A.triangularView<Upper>()
-A.triangularView<StrictlyLower>()\endcode</td></tr>
-<tr><td colspan="2"></td></tr>
-<tr><td>\code
-A.part<SelfAdfjoint|UpperTriangular>();
-A.extract<SelfAdfjoint|LowerTriangular>();\endcode</td>
-<td>\code
-A.selfadjointView<Upper>()
-A.selfadjointView<Lower>()\endcode</td></tr>
-<tr><td colspan="2"></td></tr>
-<tr><td>\code
-UpperTriangular
-LowerTriangular
-UnitUpperTriangular
-UnitLowerTriangular
-StrictlyUpperTriangular
-StrictlyLowerTriangular
-\endcode</td><td>\code
-Upper
-Lower
-UnitUpper
-UnitLower
-StrictlyUpper
-StrictlyLower
-\endcode</td>
-</tr>
-</table>
-
-\sa class TriangularView, class SelfAdjointView
-
-\section TriangularSolveInPlace Triangular in-place solving
-
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th></tr>
-<tr><td>\code A.triangularSolveInPlace<XxxTriangular>(Y);\endcode</td><td>\code A.triangularView<Xxx>().solveInPlace(Y);\endcode</td></tr>
-</table>
-
-
-\section Decompositions Matrix decompositions
-
-Some of Eigen 2's matrix decompositions have been renamed in Eigen 3, while some others have been removed and are replaced by other decompositions in Eigen 3.
-
-<table class="manual">
-  <tr>
-    <th>Eigen 2</th>
-    <th>Eigen 3</th>
-    <th>Notes</th>
-  </tr>
-  <tr>
-    <td>LU</td>
-    <td>FullPivLU</td>
-    <td class="alt">See also the new PartialPivLU, it's much faster</td>
-  </tr>
-  <tr>
-    <td>QR</td>
-    <td>HouseholderQR</td>
-    <td class="alt">See also the new ColPivHouseholderQR, it's more reliable</td>
-  </tr>
-  <tr>
-    <td>SVD</td>
-    <td>JacobiSVD</td>
-    <td class="alt">We currently don't have a bidiagonalizing SVD; of course this is planned.</td>
-  </tr>
-  <tr>
-    <td>EigenSolver and friends</td>
-    <td>\code #include<Eigen/Eigenvalues> \endcode </td>
-    <td class="alt">Moved to separate module</td>
-  </tr>
-</table>
-
-\section LinearSolvers Linear solvers
-
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th><th>Notes</th></tr>
-<tr><td>\code A.lu();\endcode</td>
-<td>\code A.fullPivLu();\endcode</td>
-<td class="alt">Now A.lu() returns a PartialPivLU</td></tr>
-<tr><td>\code A.lu().solve(B,&X);\endcode</td>
-<td>\code X = A.lu().solve(B);
- X = A.fullPivLu().solve(B);\endcode</td>
-<td class="alt">The returned by value is fully optimized</td></tr>
-<tr><td>\code A.llt().solve(B,&X);\endcode</td>
-<td>\code X = A.llt().solve(B);
- X = A.selfadjointView<Lower>.llt().solve(B);
- X = A.selfadjointView<Upper>.llt().solve(B);\endcode</td>
-<td class="alt">The returned by value is fully optimized and \n
-the selfadjointView API allows you to select the \n
-triangular part to work on (default is lower part)</td></tr>
-<tr><td>\code A.llt().solveInPlace(B);\endcode</td>
-<td>\code B = A.llt().solve(B);
- B = A.selfadjointView<Lower>.llt().solve(B);
- B = A.selfadjointView<Upper>.llt().solve(B);\endcode</td>
-<td class="alt">In place solving</td></tr>
-<tr><td>\code A.ldlt().solve(B,&X);\endcode</td>
-<td>\code X = A.ldlt().solve(B);
- X = A.selfadjointView<Lower>.ldlt().solve(B);
- X = A.selfadjointView<Upper>.ldlt().solve(B);\endcode</td>
-<td class="alt">The returned by value is fully optimized and \n
-the selfadjointView API allows you to select the \n
-triangular part to work on</td></tr>
-</table>
-
-\section GeometryModule Changes in the Geometry module
-
-The Geometry module is the one that changed the most. If you rely heavily on it, it's probably a good idea to use the <a href="http://eigen.tuxfamily.org/dox-3.2/Eigen2SupportModes.html">"Eigen 2 support modes"</a> to perform your migration.
-
-\section Transform The Transform class
-
-In Eigen 2, the Transform class didn't really know whether it was a projective or affine transformation. In Eigen 3, it takes a new \a Mode template parameter, which indicates whether it's \a Projective or \a Affine transform. There is no default value.
-
-The Transform3f (etc) typedefs are no more. In Eigen 3, the Transform typedefs explicitly refer to the \a Projective and \a Affine modes:
-
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th><th>Notes</th></tr>
-<tr>
-  <td> Transform3f </td>
-  <td> Affine3f or Projective3f </td>
-  <td> Of course 3f is just an example here </td>
-</tr>
-</table>
-
-
-\section LazyVsNoalias Lazy evaluation and noalias
-
-In Eigen all operations are performed in a lazy fashion except the matrix products which are always evaluated into a temporary by default.
-In Eigen2, lazy evaluation could be enforced by tagging a product using the .lazy() function. However, in complex expressions it was not
-easy to determine where to put the lazy() function. In Eigen3, the lazy() feature has been superseded by the MatrixBase::noalias() function
-which can be used on the left hand side of an assignment when no aliasing can occur. Here is an example:
-\code
-MatrixXf a, b, c;
-...
-c.noalias() += 2 * a.transpose() * b;
-\endcode
-However, the noalias mechanism does not cover all the features of the old .lazy(). Indeed, in some extremely rare cases,
-it might be useful to explicit request for a lay product, i.e., for a product which will be evaluated one coefficient at once, on request,
-just like any other expressions. To this end you can use the MatrixBase::lazyProduct() function, however we strongly discourage you to
-use it unless you are sure of what you are doing, i.e., you have rigourosly measured a speed improvement.
-
-\section AlignMacros Alignment-related macros
-
-The EIGEN_ALIGN_128 macro has been renamed to EIGEN_ALIGN16. Don't be surprised, it's just that we switched to counting in bytes ;-)
-
-The \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN \endlink option still exists in Eigen 3, but it has a new cousin: \link TopicPreprocessorDirectivesPerformance  EIGEN_DONT_ALIGN_STATICALLY.\endlink It allows to get rid of all static alignment issues while keeping alignment of dynamic-size heap-allocated arrays. Vectorization of statically allocated arrays is still preserved (unless you define \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink =0), at the cost of unaligned memory stores.
-
-\section AlignedMap Aligned Map objects
-
-A common issue with Eigen 2 was that when mapping an array with Map, there was no way to tell Eigen that your array was aligned. There was a ForceAligned option but it didn't mean that; it was just confusing and has been removed.
-
-New in Eigen3 is the #Aligned option. See the documentation of class Map. Use it like this:
-\code
-Map<Vector4f, Aligned> myMappedVector(some_aligned_array);
-\endcode
-There also are related convenience static methods, which actually are the preferred way as they take care of such things as constness:
-\code
-result = Vector4f::MapAligned(some_aligned_array);
-\endcode
-
-\section StdContainers STL Containers
-
-In Eigen2, <tt>\#include\<Eigen/StdVector\></tt> tweaked std::vector to automatically align elements. The problem was that that was quite invasive. In Eigen3, we only override standard behavior if you use Eigen::aligned_allocator<T> as your allocator type. So for example, if you use std::vector<Matrix4f>, you need to do the following change (note that aligned_allocator is under namespace Eigen):
-
-<table class="manual">
-<tr><th>Eigen 2</th><th>Eigen 3</th></tr>
-<tr>
-  <td> \code std::vector<Matrix4f> \endcode </td>
-  <td> \code std::vector<Matrix4f, aligned_allocator<Matrix4f> > \endcode </td>
-</tr>
-</table>
-
-\section eiPrefix Internal ei_ prefix
-
-In Eigen2, global internal functions and structures were prefixed by \c ei_. In Eigen3, they all have been moved into the more explicit \c internal namespace. So, e.g., \c ei_sqrt(x) now becomes \c internal::sqrt(x). Of course it is not recommended to rely on Eigen's internal features.
-
-
-
-*/
-
-}
diff --git a/contrib/eigen/doc/AsciiQuickReference.txt b/contrib/eigen/doc/AsciiQuickReference.txt
index 0ca54cef3e1abf561914f877b2069ff73c8950d3..18b4446c672730c42095b7d8716cfdcd33de2ec6 100644
--- a/contrib/eigen/doc/AsciiQuickReference.txt
+++ b/contrib/eigen/doc/AsciiQuickReference.txt
@@ -50,6 +50,12 @@ VectorXi::LinSpaced(((hi-low)/step)+1,      // low:step:hi
 // Matrix slicing and blocks. All expressions listed here are read/write.
 // Templated size versions are faster. Note that Matlab is 1-based (a size N
 // vector is x(1)...x(N)).
+/******************************************************************************/
+/*                  PLEASE HELP US IMPROVING THIS SECTION                     */
+/* Eigen 3.4 supports a much improved API for sub-matrices, including,        */
+/* slicing and indexing from arrays:                                          */
+/* http://eigen.tuxfamily.org/dox-devel/group__TutorialSlicingIndexing.html   */
+/******************************************************************************/
 // Eigen                           // Matlab
 x.head(n)                          // x(1:n)
 x.head<n>()                        // x(1:n)
@@ -88,6 +94,11 @@ R.row(i) = P.col(j);               // R(i, :) = P(:, j)
 R.col(j1).swap(mat1.col(j2));      // R(:, [j1 j2]) = R(:, [j2, j1])
 
 // Views, transpose, etc;
+/******************************************************************************/
+/*                  PLEASE HELP US IMPROVING THIS SECTION                     */
+/* Eigen 3.4 supports a new API for reshaping:                                */
+/* http://eigen.tuxfamily.org/dox-devel/group__TutorialReshape.html           */
+/******************************************************************************/
 // Eigen                           // Matlab
 R.adjoint()                        // R'
 R.transpose()                      // R.' or conj(R')       // Read-write
diff --git a/contrib/eigen/doc/CMakeLists.txt b/contrib/eigen/doc/CMakeLists.txt
index 8ff7559885e1bc2369bb7ba8fa2873569ea6ce90..0f9ef23824fe16dfcecdfa618aa139d85a725b74 100644
--- a/contrib/eigen/doc/CMakeLists.txt
+++ b/contrib/eigen/doc/CMakeLists.txt
@@ -7,11 +7,14 @@ project(EigenDoc)
 if(CMAKE_COMPILER_IS_GNUCXX)
   if(CMAKE_SYSTEM_NAME MATCHES Linux)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O1 -g1")
-  endif(CMAKE_SYSTEM_NAME MATCHES Linux)
-endif(CMAKE_COMPILER_IS_GNUCXX)
+  endif()
+endif()
 
-option(EIGEN_INTERNAL_DOCUMENTATION "Build internal documentation" OFF)
+# some examples and snippets needs c++11, so let's check it once
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
 
+option(EIGEN_INTERNAL_DOCUMENTATION "Build internal documentation" OFF)
+option(EIGEN_DOC_USE_MATHJAX "Use MathJax for rendering math in HTML docs" ON)
 
 # Set some Doxygen flags
 set(EIGEN_DOXY_PROJECT_NAME             "Eigen")
@@ -19,11 +22,18 @@ set(EIGEN_DOXY_OUTPUT_DIRECTORY_SUFFIX  "")
 set(EIGEN_DOXY_INPUT                    "\"${Eigen_SOURCE_DIR}/Eigen\" \"${Eigen_SOURCE_DIR}/doc\"")
 set(EIGEN_DOXY_HTML_COLORSTYLE_HUE      "220")
 set(EIGEN_DOXY_TAGFILES                 "")
+
 if(EIGEN_INTERNAL_DOCUMENTATION)
   set(EIGEN_DOXY_INTERNAL                 "YES")
-else(EIGEN_INTERNAL_DOCUMENTATION)
+else()
   set(EIGEN_DOXY_INTERNAL                 "NO")
-endif(EIGEN_INTERNAL_DOCUMENTATION)
+endif()
+
+if (EIGEN_DOC_USE_MATHJAX)
+  set(EIGEN_DOXY_USE_MATHJAX "YES")
+else ()
+  set(EIGEN_DOXY_USE_MATHJAX "NO")
+endif()
 
 configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
@@ -103,7 +113,7 @@ add_custom_target(doc ALL
   COMMAND doxygen Doxyfile-unsupported
   COMMAND ${CMAKE_COMMAND} -E copy ${Eigen_BINARY_DIR}/doc/html/group__TopicUnalignedArrayAssert.html ${Eigen_BINARY_DIR}/doc/html/TopicUnalignedArrayAssert.html
   COMMAND ${CMAKE_COMMAND} -E rename html eigen-doc
-  COMMAND ${CMAKE_COMMAND} -E remove eigen-doc/eigen-doc.tgz
+  COMMAND ${CMAKE_COMMAND} -E remove eigen-doc/eigen-doc.tgz eigen-doc/unsupported/_formulas.log eigen-doc/_formulas.log
   COMMAND ${CMAKE_COMMAND} -E tar cfz eigen-doc.tgz eigen-doc
   COMMAND ${CMAKE_COMMAND} -E rename eigen-doc.tgz eigen-doc/eigen-doc.tgz
   COMMAND ${CMAKE_COMMAND} -E rename eigen-doc html
diff --git a/contrib/eigen/doc/CoeffwiseMathFunctionsTable.dox b/contrib/eigen/doc/CoeffwiseMathFunctionsTable.dox
index 12a565b166478672f13018c536f5dbe6a5abe815..3f5c564462bc2d3c5c243a02abd94f99f1aed05a 100644
--- a/contrib/eigen/doc/CoeffwiseMathFunctionsTable.dox
+++ b/contrib/eigen/doc/CoeffwiseMathFunctionsTable.dox
@@ -73,6 +73,20 @@ This also means that, unless specified, if the function \c std::foo is available
   </td>
   <td>All engines (fc,fd)</td>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_arg
+  a.\link ArrayBase::arg arg\endlink(); \n
+  \link Eigen::arg arg\endlink(a); \n
+  m.\link MatrixBase::cwiseArg cwiseArg\endlink();
+  </td>
+  <td>phase angle of complex number</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/complex/arg">std::arg</a>; \n
+  arg(a[i]);
+  </td>
+  <td>All engines (fc,fd)</td>
+</tr>
 <tr>
 <th colspan="4">Exponential functions</th>
 </tr>
@@ -320,6 +334,42 @@ This also means that, unless specified, if the function \c std::foo is available
   tanh(a[i]);</td>
   <td></td>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_asinh
+  a.\link ArrayBase::asinh asinh\endlink(); \n
+  \link Eigen::asinh asinh\endlink(a);
+  </td>
+  <td>computes inverse hyperbolic sine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/asinh">std::asinh</a>; \n
+  asinh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_acosh
+  a.\link ArrayBase::acosh cohs\endlink(); \n
+  \link Eigen::acosh acosh\endlink(a);
+  </td>
+  <td>computes hyperbolic cosine</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/acosh">std::acosh</a>; \n
+  acosh(a[i]);</td>
+  <td></td>
+</tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_atanh
+  a.\link ArrayBase::atanh atanh\endlink(); \n
+  \link Eigen::atanh atanh\endlink(a);
+  </td>
+  <td>computes hyperbolic tangent</td>
+  <td class="code">
+  using <a href="http://en.cppreference.com/w/cpp/numeric/math/atanh">std::atanh</a>; \n
+  atanh(a[i]);</td>
+  <td></td>
+</tr>
 <tr>
 <th colspan="4">Nearest integer floating point operations</th>
 </tr>
@@ -358,6 +408,17 @@ This also means that, unless specified, if the function \c std::foo is available
   plus \c using <a href="http://en.cppreference.com/w/cpp/numeric/math/round">\c std::round </a>; \cpp11</td>
   <td>SSE4,AVX,ZVector (f,d)</td>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_rint
+  a.\link ArrayBase::rint rint\endlink(); \n
+  \link Eigen::rint rint\endlink(a);
+  </td>
+  <td>nearest integer, \n rounding to nearest even in halfway cases</td>
+  <td>built-in generic implementation using <a href="http://en.cppreference.com/w/cpp/numeric/math/rint">\c std::rint </a>; \cpp11
+  or <a href="http://en.cppreference.com/w/c/numeric/math/rint">\c rintf </a>; </td>
+  <td>SSE4,AVX (f,d)</td>
+</tr>
 <tr>
 <th colspan="4">Floating point manipulation functions</th>
 </tr>
@@ -507,7 +568,8 @@ This also means that, unless specified, if the function \c std::foo is available
 <tr>
   <td class="code">
   \anchor cwisetable_zeta
-  \link Eigen::zeta zeta\endlink(a,b);
+  \link Eigen::zeta zeta\endlink(a,b); \n
+  a.\link ArrayBase::zeta zeta\endlink(b);
   </td>
   <td><a href="https://en.wikipedia.org/wiki/Hurwitz_zeta_function">Hurwitz zeta function</a>
   \n \f$ \zeta(a_i,b_i)=\sum_{k=0}^{\infty}(b_i+k)^{\text{-}a_i} \f$</td>
@@ -516,6 +578,18 @@ This also means that, unless specified, if the function \c std::foo is available
   </td>
   <td></td>
 </tr>
+<tr>
+  <td class="code">
+  \anchor cwisetable_ndtri
+  a.\link ArrayBase::ndtri ndtri\endlink(); \n
+  \link Eigen::ndtri ndtri\endlink(a);
+  </td>
+  <td>Inverse of the CDF of the Normal distribution function</td>
+  <td>
+  built-in for float and double
+  </td>
+  <td></td>
+</tr>
 <tr><td colspan="4"></td></tr>
 </table>
 
diff --git a/contrib/eigen/doc/DenseDecompositionBenchmark.dox b/contrib/eigen/doc/DenseDecompositionBenchmark.dox
index 7be9c70cd5d2f3e1e2e83eaf4f2ed7a629d27b3e..8f9570b7aa227fe98df0d58ec09edf2659184d8c 100644
--- a/contrib/eigen/doc/DenseDecompositionBenchmark.dox
+++ b/contrib/eigen/doc/DenseDecompositionBenchmark.dox
@@ -35,7 +35,7 @@ Timings are in \b milliseconds, and factors are relative to the LLT decompositio
  + For large problem sizes, only the decomposition implementing a cache-friendly blocking strategy scale well. Those include LLT, PartialPivLU, HouseholderQR, and BDCSVD. This explain why for a 4k x 4k matrix, HouseholderQR is faster than LDLT. In the future, LDLT and ColPivHouseholderQR will also implement blocking strategies.
  + CompleteOrthogonalDecomposition is based on ColPivHouseholderQR and they thus achieve the same level of performance.
 
-The above table has been generated by the <a href="https://bitbucket.org/eigen/eigen/raw/default/bench/dense_solvers.cpp">bench/dense_solvers.cpp</a> file, feel-free to hack it to generate a table matching your hardware, compiler, and favorite problem sizes.
+The above table has been generated by the <a href="https://gitlab.com/libeigen/eigen/raw/master/bench/dense_solvers.cpp">bench/dense_solvers.cpp</a> file, feel-free to hack it to generate a table matching your hardware, compiler, and favorite problem sizes.
 
 */
 
diff --git a/contrib/eigen/doc/Doxyfile.in b/contrib/eigen/doc/Doxyfile.in
index d1deb9592a6c40b239f0d38c523c2b9e0d1b3175..bc1e03c40455c16a1df0fadd3e69349126eaf4e6 100644
--- a/contrib/eigen/doc/Doxyfile.in
+++ b/contrib/eigen/doc/Doxyfile.in
@@ -229,7 +229,8 @@ ALIASES                = "only_for_vectors=This is only for vectors (either row-
                          "blank= " \
                          "cpp11=<span class='cpp11'>[c++11]</span>" \
                          "cpp14=<span class='cpp14'>[c++14]</span>" \
-                         "cpp17=<span class='cpp17'>[c++17]</span>"
+                         "cpp17=<span class='cpp17'>[c++17]</span>" \
+                         "newin{1}=<span class='newin3x'>New in %Eigen \1.</span>"
                          
 
 ALIASES += "eigenAutoToc=  "
@@ -409,7 +410,7 @@ EXTRACT_PACKAGE        = NO
 # If the EXTRACT_STATIC tag is set to YES all static members of a file
 # will be included in the documentation.
 
-EXTRACT_STATIC         = NO
+EXTRACT_STATIC         = YES
 
 # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
 # defined locally in source files will be included in the documentation.
@@ -736,6 +737,14 @@ EXCLUDE                = "${Eigen_SOURCE_DIR}/Eigen/src/Core/products" \
                          "${Eigen_SOURCE_DIR}/unsupported/doc/examples" \
                          "${Eigen_SOURCE_DIR}/unsupported/doc/snippets"
 
+# Forward declarations of class templates cause the title of the main page for
+# the class template to not contain the template signature.  This only happens
+# when the \class command is used to document the class.  Possibly caused
+# by https://github.com/doxygen/doxygen/issues/7698.  Confirmed fixed by
+# doxygen release 1.8.19.
+
+EXCLUDE += "${Eigen_SOURCE_DIR}/Eigen/src/Core/util/ForwardDeclarations.h"
+
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
 # from the input.
@@ -1245,7 +1254,7 @@ FORMULA_TRANSPARENT    = YES
 # output. When enabled you may also need to install MathJax separately and
 # configure the path to it using the MATHJAX_RELPATH option.
 
-USE_MATHJAX            = NO
+USE_MATHJAX            = @EIGEN_DOXY_USE_MATHJAX@
 
 # When MathJax is enabled you need to specify the location relative to the
 # HTML output directory using the MATHJAX_RELPATH option. The destination
@@ -1257,12 +1266,12 @@ USE_MATHJAX            = NO
 # However, it is strongly recommended to install a local
 # copy of MathJax from http://www.mathjax.org before deployment.
 
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+MATHJAX_RELPATH        = https://cdn.mathjax.org/mathjax/latest
 
 # The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension
 # names that should be enabled during MathJax rendering.
 
-MATHJAX_EXTENSIONS     =
+MATHJAX_EXTENSIONS     = TeX/AMSmath TeX/AMSsymbols
 
 # When the SEARCHENGINE tag is enabled doxygen will generate a search box
 # for the HTML output. The underlying search engine uses javascript
@@ -1591,6 +1600,8 @@ PREDEFINED             = EIGEN_EMPTY_STRUCT \
                          EIGEN_QT_SUPPORT \
                          EIGEN_STRONG_INLINE=inline \
                          EIGEN_DEVICE_FUNC= \
+                         EIGEN_HAS_CXX11=1 \
+                         EIGEN_HAS_CXX11_MATH=1 \
                          "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template<typename OtherDerived> const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const;" \
                          "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp<internal::scalar_product_op<LHS::Scalar,RHS::Scalar>, const LHS, const RHS>"\
                          "EIGEN_CAT2(a,b)= a ## b"\
@@ -1770,7 +1781,7 @@ UML_LOOK               = YES
 # the class node. If there are many fields or methods and many nodes the
 # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS
 # threshold limits the number of items for each type to make the size more
-# managable. Set this to 0 for no limit. Note that the threshold may be
+# manageable. Set this to 0 for no limit. Note that the threshold may be
 # exceeded by 50% before the limit is enforced.
 
 UML_LIMIT_NUM_FIELDS   = 10
diff --git a/contrib/eigen/doc/FixedSizeVectorizable.dox b/contrib/eigen/doc/FixedSizeVectorizable.dox
index 49e38af76832f4dbdee751da1cb0697e0ca5d661..0012465cae603be78f73ceb9cebf9b6d0f2c6064 100644
--- a/contrib/eigen/doc/FixedSizeVectorizable.dox
+++ b/contrib/eigen/doc/FixedSizeVectorizable.dox
@@ -1,6 +1,6 @@
 namespace Eigen {
 
-/** \eigenManualPage TopicFixedSizeVectorizable Fixed-size vectorizable Eigen objects
+/** \eigenManualPage TopicFixedSizeVectorizable Fixed-size vectorizable %Eigen objects
 
 The goal of this page is to explain what we mean by "fixed-size vectorizable".
 
@@ -23,15 +23,15 @@ Examples include:
 
 \section FixedSizeVectorizable_explanation Explanation
 
-First, "fixed-size" should be clear: an Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example Matrix3f has fixed size, but MatrixXf doesn't (the opposite of fixed-size is dynamic-size).
+First, "fixed-size" should be clear: an %Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example \ref Matrix3f has fixed size, but \ref MatrixXf doesn't (the opposite of fixed-size is dynamic-size).
 
-The array of coefficients of a fixed-size Eigen object is a plain "static array", it is not dynamically allocated. For example, the data behind a Matrix4f is just a "float array[16]".
+The array of coefficients of a fixed-size %Eigen object is a plain "static array", it is not dynamically allocated. For example, the data behind a \ref Matrix4f is just a "float array[16]".
 
 Fixed-size objects are typically very small, which means that we want to handle them with zero runtime overhead -- both in terms of memory usage and of speed.
 
-Now, vectorization (both SSE and AltiVec) works with 128-bit packets. Moreover, for performance reasons, these packets need to be have 128-bit alignment.
+Now, vectorization works with 128-bit packets (e.g., SSE, AltiVec, NEON), 256-bit packets (e.g., AVX), or 512-bit packets (e.g., AVX512). Moreover, for performance reasons, these packets are most efficiently read and written if they have the same alignment as the packet size, that is 16 bytes, 32 bytes, and 64 bytes respectively.
 
-So it turns out that the only way that fixed-size Eigen objects can be vectorized, is if their size is a multiple of 128 bits, or 16 bytes. Eigen will then request 16-byte alignment for these objects, and henceforth rely on these objects being aligned so no runtime check for alignment is performed.
+So it turns out that the best way that fixed-size %Eigen objects can be vectorized, is if their size is a multiple of 16 bytes (or more). %Eigen will then request 16-byte alignment (or more) for these objects, and henceforth rely on these objects being aligned to achieve maximal efficiency.
 
 */
 
diff --git a/contrib/eigen/doc/FunctionsTakingEigenTypes.dox b/contrib/eigen/doc/FunctionsTakingEigenTypes.dox
index 152dda47d91560a9b363637b13664b056d0ee866..6b4e49214c295cb67eb3cf3a01767661684a0765 100644
--- a/contrib/eigen/doc/FunctionsTakingEigenTypes.dox
+++ b/contrib/eigen/doc/FunctionsTakingEigenTypes.dox
@@ -79,7 +79,7 @@ These examples are just intended to give the reader a first impression of how fu
 
 \section TopicUsingRefClass How to write generic, but non-templated function?
 
-In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated function and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This exactly the purpose of the Ref class. Here is a simple example:
+In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated functions and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This is exactly the purpose of the Ref class. Here is a simple example:
 
 <table class="example">
 <tr><th>Example:</th><th>Output:</th></tr>
@@ -133,7 +133,7 @@ In this special case, the example is fine and will be working because both param
 
 \section TopicPlainFunctionsFailing In which cases do functions taking a plain Matrix or Array argument fail?
 
-Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const paramter which allows us to store the result. A first naive implementation might look as follows.
+Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const parameter which allows us to store the result. A first naive implementation might look as follows.
 \code
 // Note: This code is flawed!
 void cov(const MatrixXf& x, const MatrixXf& y, MatrixXf& C)
@@ -176,7 +176,7 @@ The implementation above does now not only work with temporary expressions but i
 
 \section TopicResizingInGenericImplementations How to resize matrices in generic implementations?
 
-One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the follwing code to work
+One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the following code to work
 \code
 MatrixXf x = MatrixXf::Random(100,3);
 MatrixXf y = MatrixXf::Random(100,3);
diff --git a/contrib/eigen/doc/HiPerformance.dox b/contrib/eigen/doc/HiPerformance.dox
index ab6cdfd44888072e9b0cd55324f86dcbabfc99e9..9cee3351c21cae07130bf3f75b9262a6f6c7c528 100644
--- a/contrib/eigen/doc/HiPerformance.dox
+++ b/contrib/eigen/doc/HiPerformance.dox
@@ -105,7 +105,7 @@ m1.noalias() += m2 * m3; \endcode</td>
 <td>First of all, here the .noalias() in the first expression is useless because
     m2*m3 will be evaluated anyway. However, note how this expression can be rewritten
     so that no temporary is required. (tip: for very small fixed size matrix
-    it is slighlty better to rewrite it like this: m1.noalias() = m2 * m3; m1 += m4;</td>
+    it is slightly better to rewrite it like this: m1.noalias() = m2 * m3; m1 += m4;</td>
 </tr>
 <tr class="alt">
 <td>\code
diff --git a/contrib/eigen/doc/InsideEigenExample.dox b/contrib/eigen/doc/InsideEigenExample.dox
index ed053c69d8a3e64b4e7a309fbc90174ab9d298b7..ea2275bf2485a700b47a7f1c8f3785fa4a65fb5e 100644
--- a/contrib/eigen/doc/InsideEigenExample.dox
+++ b/contrib/eigen/doc/InsideEigenExample.dox
@@ -212,6 +212,11 @@ Thus, the operator+ hasn't performed any actual computation. To summarize, the o
 
 \section Assignment The assignment
 
+<div class="warningbox">
+<strong>PLEASE HELP US IMPROVING THIS SECTION.</strong>
+This page reflects how %Eigen worked until 3.2, but since %Eigen 3.3 the assignment is more sophisticated as it involves an Assignment expression, and the creation of so called evaluator which are responsible for the evaluation of each kind of expressions.
+</div>
+
 At this point, the expression \a v + \a w has finished evaluating, so, in the process of compiling the line of code
 \code
 u = v + w;
diff --git a/contrib/eigen/doc/LeastSquares.dox b/contrib/eigen/doc/LeastSquares.dox
index 24dfe4b4f47b7f0cef94401820f0598f5016d119..ddbf38dec9b27f7fd5bac0d74efc5af9dfe77ca8 100644
--- a/contrib/eigen/doc/LeastSquares.dox
+++ b/contrib/eigen/doc/LeastSquares.dox
@@ -30,14 +30,17 @@ computing least squares solutions:
 </table>
 
 This is example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink.
+If you just need to solve the least squares problem, but are not interested in the SVD per se, a
+faster alternative method is CompleteOrthogonalDecomposition. 
 
 
 \section LeastSquaresQR Using the QR decomposition
 
 The solve() method in QR decomposition classes also computes the least squares solution. There are
-three QR decomposition classes: HouseholderQR (no pivoting, so fast but unstable),
-ColPivHouseholderQR (column pivoting, thus a bit slower but more accurate) and FullPivHouseholderQR
-(full pivoting, so slowest and most stable). Here is an example with column pivoting:
+three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is
+not rull rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and
+FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR).
+Here is an example with column pivoting:
 
 <table class="example">
 <tr><th>Example:</th><th>Output:</th></tr>
@@ -61,9 +64,11 @@ Finding the least squares solution of \a Ax = \a b is equivalent to solving the
 </tr>
 </table>
 
-If the matrix \a A is ill-conditioned, then this is not a good method, because the condition number
+This method is usually the fastest, especially when \a A is "tall and skinny". However, if the
+matrix \a A is even mildly ill-conditioned, this is not a good method, because the condition number
 of <i>A</i><sup>T</sup><i>A</i> is the square of the condition number of \a A. This means that you
-lose twice as many digits using normal equation than if you use the other methods.
+lose roughly twice as many digits of accuracy using the normal equation, compared to the more stable
+methods mentioned above.
 
 */
 
diff --git a/contrib/eigen/doc/Manual.dox b/contrib/eigen/doc/Manual.dox
index 342b145fde4584c027754d6a5e9a2cc82a38cafc..84f0db6458435f768f25f78228a7dae972a46d1e 100644
--- a/contrib/eigen/doc/Manual.dox
+++ b/contrib/eigen/doc/Manual.dox
@@ -15,7 +15,6 @@ namespace Eigen {
 
 
 /** \page UserManual_Generalities General topics
-  - \subpage Eigen2ToEigen3
   - \subpage TopicFunctionTakingEigenTypes
   - \subpage TopicPreprocessorDirectives
   - \subpage TopicAssertions
@@ -64,42 +63,46 @@ namespace Eigen {
     \ingroup DenseMatrixManipulation_chapter */
 /** \addtogroup TutorialBlockOperations
     \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialSlicingIndexing
+    \ingroup DenseMatrixManipulation_chapter */
 /** \addtogroup TutorialAdvancedInitialization
     \ingroup DenseMatrixManipulation_chapter */
 /** \addtogroup TutorialReductionsVisitorsBroadcasting
     \ingroup DenseMatrixManipulation_chapter */
-/** \addtogroup TutorialMapClass
+/** \addtogroup TutorialReshape
     \ingroup DenseMatrixManipulation_chapter */
-/** \addtogroup TutorialReshapeSlicing
+/** \addtogroup TutorialSTL
+    \ingroup DenseMatrixManipulation_chapter */
+/** \addtogroup TutorialMapClass
     \ingroup DenseMatrixManipulation_chapter */
 /** \addtogroup TopicAliasing
     \ingroup DenseMatrixManipulation_chapter */
 /** \addtogroup TopicStorageOrders
     \ingroup DenseMatrixManipulation_chapter */
-    
+
 /** \addtogroup DenseMatrixManipulation_Alignement
-    \ingroup DenseMatrixManipulation_chapter */
-/** \addtogroup TopicUnalignedArrayAssert
-    \ingroup DenseMatrixManipulation_Alignement */
-/** \addtogroup TopicFixedSizeVectorizable
-    \ingroup DenseMatrixManipulation_Alignement */
-/** \addtogroup TopicStructHavingEigenMembers
-    \ingroup DenseMatrixManipulation_Alignement */
-/** \addtogroup TopicStlContainers
-    \ingroup DenseMatrixManipulation_Alignement */
-/** \addtogroup TopicPassingByValue
-    \ingroup DenseMatrixManipulation_Alignement */
-/** \addtogroup TopicWrongStackAlignment
-    \ingroup DenseMatrixManipulation_Alignement */
+    \ingroup DenseMatrixManipulation_chapter        */
+/**     \addtogroup TopicUnalignedArrayAssert
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicFixedSizeVectorizable
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicStructHavingEigenMembers
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicStlContainers
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicPassingByValue
+        \ingroup DenseMatrixManipulation_Alignement */
+/**     \addtogroup TopicWrongStackAlignment
+        \ingroup DenseMatrixManipulation_Alignement */
     
 /** \addtogroup DenseMatrixManipulation_Reference
-    \ingroup DenseMatrixManipulation_chapter */
-/** \addtogroup Core_Module
-    \ingroup DenseMatrixManipulation_Reference */  
-/** \addtogroup Jacobi_Module
-    \ingroup DenseMatrixManipulation_Reference */ 
-/** \addtogroup Householder_Module
-    \ingroup DenseMatrixManipulation_Reference */ 
+    \ingroup DenseMatrixManipulation_chapter       */
+/**     \addtogroup Core_Module
+        \ingroup DenseMatrixManipulation_Reference */  
+/**     \addtogroup Jacobi_Module
+        \ingroup DenseMatrixManipulation_Reference */ 
+/**     \addtogroup Householder_Module
+        \ingroup DenseMatrixManipulation_Reference */ 
 
 /** \addtogroup CoeffwiseMathFunctions
     \ingroup DenseMatrixManipulation_chapter */
diff --git a/contrib/eigen/doc/Overview.dox b/contrib/eigen/doc/Overview.dox
index dbb49bd2183338cfe17fe7e0bd24319d74743752..43a12871e0fbd0932e8e4b98c0b93605bae1c0b6 100644
--- a/contrib/eigen/doc/Overview.dox
+++ b/contrib/eigen/doc/Overview.dox
@@ -4,8 +4,6 @@ namespace Eigen {
 
 This is the API documentation for Eigen3. You can <a href="eigen-doc.tgz">download</a> it as a tgz archive for offline reading.
 
-You're already an Eigen2 user? Here is a \link Eigen2ToEigen3 Eigen2 to Eigen3 guide \endlink to help porting your application.
-
 For a first contact with Eigen, the best place is to have a look at the \link GettingStarted getting started \endlink page that show you how to write and compile your first program with Eigen.
 
 Then, the \b quick \b reference \b pages give you a quite complete description of the API in a very condensed format that is specially useful to recall the syntax of a particular feature, or to have a quick look at the API. They currently cover the two following feature sets, and more will come in the future:
diff --git a/contrib/eigen/doc/PassingByValue.dox b/contrib/eigen/doc/PassingByValue.dox
index bf4d0ef4ba41ece65039b4a85ec9adf19e7e2fc3..9254fe6d88e6da635a2a0f06a23315ed1b587cee 100644
--- a/contrib/eigen/doc/PassingByValue.dox
+++ b/contrib/eigen/doc/PassingByValue.dox
@@ -4,21 +4,21 @@ namespace Eigen {
 
 Passing objects by value is almost always a very bad idea in C++, as this means useless copies, and one should pass them by reference instead.
 
-With Eigen, this is even more important: passing \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" by value is not only inefficient, it can be illegal or make your program crash! And the reason is that these Eigen objects have alignment modifiers that aren't respected when they are passed by value.
+With %Eigen, this is even more important: passing \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" by value is not only inefficient, it can be illegal or make your program crash! And the reason is that these %Eigen objects have alignment modifiers that aren't respected when they are passed by value.
 
-So for example, a function like this, where v is passed by value:
+For example, a function like this, where \c v is passed by value:
 
 \code
 void my_function(Eigen::Vector2d v);
 \endcode
 
-needs to be rewritten as follows, passing v by reference:
+needs to be rewritten as follows, passing \c v by const reference:
 
 \code
 void my_function(const Eigen::Vector2d& v);
 \endcode
 
-Likewise if you have a class having a Eigen object as member:
+Likewise if you have a class having an %Eigen object as member:
 
 \code
 struct Foo
diff --git a/contrib/eigen/doc/Pitfalls.dox b/contrib/eigen/doc/Pitfalls.dox
index fda402572071e4705b2f06cb88ba341203e2ddfb..85282bd6f544e02bf7c4330837adde410e2771ff 100644
--- a/contrib/eigen/doc/Pitfalls.dox
+++ b/contrib/eigen/doc/Pitfalls.dox
@@ -17,7 +17,7 @@ especially if you got wrong results in statements where the destination appears
 \section TopicPitfalls_alignment_issue Alignment Issues (runtime assertion)
 
 %Eigen does explicit vectorization, and while that is appreciated by many users, that also leads to some issues in special situations where data alignment is compromised.
-Indeed, since C++17,  C++ does not have quite good enough support for explicit data alignment.
+Indeed, prior to C++17,  C++ does not have quite good enough support for explicit data alignment.
 In that case your program hits an assertion failure (that is, a "controlled crash") with a message that tells you to consult this page:
 \code
 http://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html
@@ -40,7 +40,17 @@ for(...) { ... w = C * v;  ...}
 
 In this example, the type of C is not a \c MatrixXd but an abstract expression representing a matrix product and storing references to \c A and \c B.
 Therefore, the product of \c A*B will be carried out multiple times, once per iteration of the for loop.
-Moreover, if the coefficients of A or B change during the iteration, then C will evaluate to different values.
+Moreover, if the coefficients of `A` or `B` change during the iteration, then `C` will evaluate to different values as in the following example:
+
+\code
+MatrixXd A = ..., B = ...;
+auto C = A*B;
+MatrixXd R1 = C;
+A = ...;
+MatrixXd R2 = C;
+\endcode
+for which we end up with `R1` &ne; `R2`.
+
 
 Here is another example leading to a segfault:
 \code
@@ -72,7 +82,7 @@ Note that DenseBase::eval() is smart enough to avoid copies when the underlying
 \section TopicPitfalls_header_issues Header Issues (failure to compile)
 
 With all libraries, one must check the documentation for which header to include.
-The same is true with %Eigen, but slightly worse: with %Eigen, a method in a class may require an additional <code>#include</code> over what the class itself requires!
+The same is true with %Eigen, but slightly worse: with %Eigen, a method in a class may require an additional \c \#include over what the class itself requires!
 For example, if you want to use the \c cross() method on a vector (it computes a cross-product) then you need to:
 \code
 #include<Eigen/Geometry>
@@ -114,5 +124,26 @@ There are at least two ways around this:
   - If the value you are passing is guaranteed to be around for the life of the functor, you can use boost::ref() to wrap the value as you pass it to boost::bind. Generally this is not a solution for values on the stack as if the functor ever gets passed to a lower or independent scope, the object may be gone by the time it's attempted to be used.
   - The other option is to make your functions take a reference counted pointer like boost::shared_ptr as the argument. This avoids needing to worry about managing the lifetime of the object being passed.
 
+
+\section TopicPitfalls_matrix_bool Matrices with boolean coefficients
+
+The current behaviour of using \c Matrix with boolean coefficients is inconsistent and likely to change in future versions of Eigen, so please use it carefully!
+
+A simple example for such an inconsistency is 
+
+\code
+template<int Size>
+void foo() {
+  Eigen::Matrix<bool, Size, Size> A, B, C;
+  A.setOnes();
+  B.setOnes();
+
+  C = A * B - A * B;
+  std::cout << C << "\n";
+}
+\endcode
+
+since calling \c foo<3>() prints the zero matrix while calling \c foo<10>() prints the identity matrix.
+
 */
 }
diff --git a/contrib/eigen/doc/PreprocessorDirectives.dox b/contrib/eigen/doc/PreprocessorDirectives.dox
index 40cceb94455b18659e18c162cf11de9765150250..0f545b08610444d345be22457ee2f60b452aa7a3 100644
--- a/contrib/eigen/doc/PreprocessorDirectives.dox
+++ b/contrib/eigen/doc/PreprocessorDirectives.dox
@@ -51,7 +51,7 @@ are doing.
 
 \section TopicPreprocessorDirectivesCppVersion C++ standard features
 
-By default, %Eigen strive to automatically detect and enable langage features at compile-time based on
+By default, %Eigen strive to automatically detect and enable language features at compile-time based on
 the information provided by the compiler.
 
  - \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER.
@@ -66,7 +66,7 @@ functions by defining EIGEN_HAS_C99_MATH=1.
    Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
  - \b EIGEN_HAS_CXX11_MATH - controls the implementation of some functions such as round, logp1, isinf, isnan, etc.
    Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
- - \b EIGEN_HAS_RVALUE_REFERENCES - defines whetehr rvalue references are supported
+ - \b EIGEN_HAS_RVALUE_REFERENCES - defines whether rvalue references are supported
    Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
  - \b EIGEN_HAS_STD_RESULT_OF - defines whether std::result_of is supported
    Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
@@ -78,6 +78,7 @@ functions by defining EIGEN_HAS_C99_MATH=1.
    Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
  - \b EIGEN_HAS_CXX11_NOEXCEPT - defines whether noexcept is supported
    Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
+ - \b EIGEN_NO_IO - Disables any usage and support for `<iostreams>`.
 
 \section TopicPreprocessorDirectivesAssertions Assertions
 
@@ -106,7 +107,7 @@ run time. However, these assertions do cost time and can thus be turned off.
  Let us emphasize that \c EIGEN_MAX_*_ALIGN_BYTES define only a diserable upper bound. In practice data is aligned to largest power-of-two common divisor of \c EIGEN_MAX_STATIC_ALIGN_BYTES and the size of the data, such that memory is not wasted.
  - \b \c EIGEN_DONT_PARALLELIZE - if defined, this disables multi-threading. This is only relevant if you enabled OpenMP.
    See \ref TopicMultiThreading for details.
- - \b EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
+ - \b \c EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
    alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
  - \b \c EIGEN_UNALIGNED_VECTORIZE - disables/enables vectorization with unaligned stores. Default is 1 (enabled).
    If set to 0 (disabled), then expression for which the destination cannot be aligned are not vectorized (e.g., unaligned
@@ -116,17 +117,21 @@ run time. However, these assertions do cost time and can thus be turned off.
    Define it to 0 to disable.
  - \b \c EIGEN_UNROLLING_LIMIT - defines the size of a loop to enable meta unrolling. Set it to zero to disable
    unrolling. The size of a loop here is expressed in %Eigen's own notion of "number of FLOPS", it does not
-   correspond to the number of iterations or the number of instructions. The default is value 100.
+   correspond to the number of iterations or the number of instructions. The default is value 110.
  - \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
    temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
    this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
+ - \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host only,
+   and never called from device code.
  - \b \c EIGEN_STRONG_INLINE - This macro is used to qualify critical functions and methods that we expect the compiler to inline.
    By default it is defined to \c __forceinline for MSVC and ICC, and to \c inline for other compilers. A tipical usage is to
    define it to \c inline for MSVC users wanting faster compilation times, at the risk of performance degradations in some rare
    cases for which MSVC inliner fails to do a good job.
+ - \b \c EIGEN_DEFAULT_L1_CACHE_SIZE - Sets the default L1 cache size that is used in Eigen's GEBP kernel when the correct cache size cannot be determined at runtime.
+ - \b \c EIGEN_DEFAULT_L2_CACHE_SIZE - Sets the default L2 cache size that is used in Eigen's GEBP kernel when the correct cache size cannot be determined at runtime.
+ - \b \c EIGEN_DEFAULT_L3_CACHE_SIZE - Sets the default L3 cache size that is used in Eigen's GEBP kernel when the correct cache size cannot be determined at runtime.
 
-
- - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
+ - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b \c EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
  - \c EIGEN_DONT_ALIGN_STATICALLY - Deprecated, it is a synonym for \c EIGEN_MAX_STATIC_ALIGN_BYTES=0. It disables alignment of arrays on the stack. Not defined by default, unless \c EIGEN_DONT_ALIGN is defined.
 
 
@@ -141,18 +146,18 @@ following macros are supported; none of them are defined by default.
  - \b EIGEN_CWISE_PLUGIN - filename of plugin for extending the Cwise class.
  - \b EIGEN_DENSEBASE_PLUGIN - filename of plugin for extending the DenseBase class.
  - \b EIGEN_DYNAMICSPARSEMATRIX_PLUGIN - filename of plugin for extending the DynamicSparseMatrix class.
+ - \b EIGEN_FUNCTORS_PLUGIN - filename of plugin for adding new functors and specializations of functor_traits.
+ - \b EIGEN_MAPBASE_PLUGIN - filename of plugin for extending the MapBase class.
  - \b EIGEN_MATRIX_PLUGIN - filename of plugin for extending the Matrix class.
  - \b EIGEN_MATRIXBASE_PLUGIN - filename of plugin for extending the MatrixBase class.
  - \b EIGEN_PLAINOBJECTBASE_PLUGIN - filename of plugin for extending the PlainObjectBase class.
- - \b EIGEN_MAPBASE_PLUGIN - filename of plugin for extending the MapBase class.
  - \b EIGEN_QUATERNION_PLUGIN - filename of plugin for extending the Quaternion class.
  - \b EIGEN_QUATERNIONBASE_PLUGIN - filename of plugin for extending the QuaternionBase class.
  - \b EIGEN_SPARSEMATRIX_PLUGIN - filename of plugin for extending the SparseMatrix class.
  - \b EIGEN_SPARSEMATRIXBASE_PLUGIN - filename of plugin for extending the SparseMatrixBase class.
  - \b EIGEN_SPARSEVECTOR_PLUGIN - filename of plugin for extending the SparseVector class.
  - \b EIGEN_TRANSFORM_PLUGIN - filename of plugin for extending the Transform class.
- - \b EIGEN_FUNCTORS_PLUGIN - filename of plugin for adding new functors and specializations of functor_traits.
-
+ - \b EIGEN_VECTORWISEOP_PLUGIN - filename of plugin for extending the VectorwiseOp class.
 
 \section TopicPreprocessorDirectivesDevelopers Macros for Eigen developers
 
diff --git a/contrib/eigen/doc/QuickReference.dox b/contrib/eigen/doc/QuickReference.dox
index 44f5410db798a75e539a8aadc1f850cc051dc08c..c5dfce42195a5f64d660d10ce346110f5ba8efbc 100644
--- a/contrib/eigen/doc/QuickReference.dox
+++ b/contrib/eigen/doc/QuickReference.dox
@@ -68,7 +68,7 @@ Array<float,4,1>                <=>   Array4f
 
 Conversion between the matrix and array worlds:
 \code
-Array44f a1, a1;
+Array44f a1, a2;
 Matrix4f m1, m2;
 m1 = a1 * a2;                     // coeffwise product, implicit conversion from array to matrix.
 a1 = m1 * m2;                     // matrix product, implicit conversion from matrix to array.
@@ -261,6 +261,8 @@ x.setIdentity();
 Vector3f::UnitX() // 1 0 0
 Vector3f::UnitY() // 0 1 0
 Vector3f::UnitZ() // 0 0 1
+Vector4f::Unit(i)
+x.setUnit(i);
 \endcode
   </td>
   <td>
@@ -278,6 +280,7 @@ N/A
 
 
 VectorXf::Unit(size,i)
+x.setUnit(size,i);
 VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
                     == Vector4f::UnitY()
 \endcode
@@ -285,7 +288,12 @@ VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
 </tr>
 </table>
 
-
+Note that it is allowed to call any of the \c set* functions to a dynamic-sized vector or matrix without passing new sizes.
+For instance:
+\code
+MatrixXi M(3,3);
+M.setIdentity();
+\endcode
 
 \subsection QuickRef_Map Mapping external arrays
 
@@ -472,13 +480,14 @@ The main difference between the two API is that the one based on cwise* methods
 while the second one (based on .array()) returns an array expression.
 Recall that .array() has no cost, it only changes the available API and interpretation of the data.
 
-It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with <a href="http://en.cppreference.com/w/cpp/utility/functional/ptr_fun">std::ptr_fun</a> (c++03), <a href="http://en.cppreference.com/w/cpp/utility/functional/ref">std::ref</a> (c++11), or <a href="http://en.cppreference.com/w/cpp/language/lambda">lambdas</a> (c++11):
+It is also very simple to apply any user defined function \c foo using DenseBase::unaryExpr together with <a href="http://en.cppreference.com/w/cpp/utility/functional/ptr_fun">std::ptr_fun</a> (c++03, deprecated or removed in newer C++ versions), <a href="http://en.cppreference.com/w/cpp/utility/functional/ref">std::ref</a> (c++11), or <a href="http://en.cppreference.com/w/cpp/language/lambda">lambdas</a> (c++11):
 \code
 mat1.unaryExpr(std::ptr_fun(foo));
 mat1.unaryExpr(std::ref(foo));
 mat1.unaryExpr([](double x) { return foo(x); });
 \endcode
 
+Please note that it's not possible to pass a raw function pointer to \c unaryExpr, so please warp it as shown above.
 
 <a href="#" class="top">top</a>
 \section QuickRef_Reductions Reductions
@@ -521,6 +530,12 @@ if((array1 < array2).any()) ... // if there exist a pair i,j such that array1(i,
 
 <a href="#" class="top">top</a>\section QuickRef_Blocks Sub-matrices
 
+<div class="warningbox">
+<strong>PLEASE HELP US IMPROVING THIS SECTION.</strong>
+%Eigen 3.4 supports a much improved API for sub-matrices, including,
+slicing and indexing from arrays: \ref TutorialSlicingIndexing
+</div>
+
 Read-write access to a \link DenseBase::col(Index) column \endlink
 or a \link DenseBase::row(Index) row \endlink of a matrix (or array):
 \code
@@ -576,6 +591,11 @@ Read-write access to sub-matrices:</td></tr>
 
 <a href="#" class="top">top</a>\section QuickRef_Misc Miscellaneous operations
 
+<div class="warningbox">
+<strong>PLEASE HELP US IMPROVING THIS SECTION.</strong>
+%Eigen 3.4 supports a new API for reshaping: \ref TutorialReshape
+</div>
+
 \subsection QuickRef_Reverse Reverse
 Vectors, rows, and/or columns of a matrix can be reversed (see DenseBase::reverse(), DenseBase::reverseInPlace(), VectorwiseOp::reverse()).
 \code
diff --git a/contrib/eigen/doc/QuickStartGuide.dox b/contrib/eigen/doc/QuickStartGuide.dox
index ea32c3b3d03072105350159a5ce511403aba5dbe..4192b28b7da7defdc084982be74e74f0a71f4aaf 100644
--- a/contrib/eigen/doc/QuickStartGuide.dox
+++ b/contrib/eigen/doc/QuickStartGuide.dox
@@ -66,9 +66,9 @@ The output is as follows:
 
 \section GettingStartedExplanation2 Explanation of the second example
 
-The second example starts by declaring a 3-by-3 matrix \c m which is initialized using the \link DenseBase::Random(Index,Index) Random() \endlink method with random values between -1 and 1. The next line applies a linear mapping such that the values are between 10 and 110. The function call \link DenseBase::Constant(Index,Index,const Scalar&) MatrixXd::Constant\endlink(3,3,1.2) returns a 3-by-3 matrix expression having all coefficients equal to 1.2. The rest is standard arithmetics.
+The second example starts by declaring a 3-by-3 matrix \c m which is initialized using the \link DenseBase::Random(Index,Index) Random() \endlink method with random values between -1 and 1. The next line applies a linear mapping such that the values are between 10 and 110. The function call \link DenseBase::Constant(Index,Index,const Scalar&) MatrixXd::Constant\endlink(3,3,1.2) returns a 3-by-3 matrix expression having all coefficients equal to 1.2. The rest is standard arithmetic.
 
-The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left unitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows:
+The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left uninitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows:
 
 \f[
 v =
diff --git a/contrib/eigen/doc/SparseLinearSystems.dox b/contrib/eigen/doc/SparseLinearSystems.dox
index fc33b93e7e563b6ba2a3d8c2ca5c2d6015f291bb..38754e4afbee9248f94a7cc28cafc9660337f52f 100644
--- a/contrib/eigen/doc/SparseLinearSystems.dox
+++ b/contrib/eigen/doc/SparseLinearSystems.dox
@@ -70,6 +70,9 @@ They are summarized in the following tables:
 <tr><td>UmfPackLU</td><td>\link UmfPackSupport_Module UmfPackSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
     <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
     <td></td></tr>
+<tr><td>KLU</td><td>\link KLUSupport_Module KLUSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, suitted for circuit simulation</td>
+    <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
+    <td></td></tr>
 <tr><td>SuperLU</td><td>\link SuperLUSupport_Module SuperLUSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
     <td>Requires the <a href="http://crd-legacy.lbl.gov/~xiaoye/SuperLU/">SuperLU</a> library, (BSD-like)</td>
     <td></td></tr>
diff --git a/contrib/eigen/doc/SparseQuickReference.dox b/contrib/eigen/doc/SparseQuickReference.dox
index a25622e800e0df278456d6d99badc47d0807a993..9779f3f9c88bd52e39f45f37a54f582de2ebe2a7 100644
--- a/contrib/eigen/doc/SparseQuickReference.dox
+++ b/contrib/eigen/doc/SparseQuickReference.dox
@@ -80,7 +80,7 @@ sm1.setZero();
 
 
 \section SparseBasicInfos Matrix properties
-Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some informations from the matrix. 
+Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some information from the matrix. 
 <table class="manual">
 <tr>
   <td> \code
@@ -244,7 +244,7 @@ As stated earlier, for a read-write sub-matrix (RW), the evaluation can be done
 <td>
 \code
 sm1.valuePtr();      // Pointer to the values
-sm1.innerIndextr();  // Pointer to the indices.
+sm1.innerIndexPtr();  // Pointer to the indices.
 sm1.outerIndexPtr(); // Pointer to the beginning of each inner vector
 \endcode
 </td>
diff --git a/contrib/eigen/doc/StlContainers.dox b/contrib/eigen/doc/StlContainers.dox
index e0f8714a95a20ee3c0d6dfc059c4b359c66e88e6..0342573d0b88c39f92071a608b2f7a7d14b8b39a 100644
--- a/contrib/eigen/doc/StlContainers.dox
+++ b/contrib/eigen/doc/StlContainers.dox
@@ -6,31 +6,39 @@ namespace Eigen {
 
 \section StlContainers_summary Executive summary
 
-Using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires taking the following two steps:
+If you're compiling in \cpp17 mode only with a sufficiently recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then everything is taken care by the compiler and you can stop reading.
 
-\li A 16-byte-aligned allocator must be used. Eigen does provide one ready for use: aligned_allocator.
-\li If you want to use the std::vector container, you need to \#include <Eigen/StdVector>.
+Otherwise, using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires the use of an over-aligned allocator.
+That is, an allocator capable of allocating buffers with 16, 32, or even 64 bytes alignment.
+%Eigen does provide one ready for use: aligned_allocator.
 
-These issues arise only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member". For other Eigen types, such as Vector3f or MatrixXd, no special care is needed when using STL containers.
+Prior to \cpp11, if you want to use the `std::vector` container, then you also have to <code> \#include <Eigen/StdVector> </code>.
+
+These issues arise only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member".
+For other %Eigen types, such as Vector3f or MatrixXd, no special care is needed when using STL containers.
 
 \section allocator Using an aligned allocator
 
-STL containers take an optional template parameter, the allocator type. When using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you need tell the container to use an allocator that will always allocate memory at 16-byte-aligned locations. Fortunately, Eigen does provide such an allocator: Eigen::aligned_allocator.
+STL containers take an optional template parameter, the allocator type. When using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you need tell the container to use an allocator that will always allocate memory at 16-byte-aligned (or more) locations. Fortunately, %Eigen does provide such an allocator: Eigen::aligned_allocator.
 
 For example, instead of
 \code
-std::map<int, Eigen::Vector4f>
+std::map<int, Eigen::Vector4d>
 \endcode
 you need to use
 \code
-std::map<int, Eigen::Vector4f, std::less<int>, 
-         Eigen::aligned_allocator<std::pair<const int, Eigen::Vector4f> > >
+std::map<int, Eigen::Vector4d, std::less<int>, 
+         Eigen::aligned_allocator<std::pair<const int, Eigen::Vector4d> > >
 \endcode
-Note that the third parameter "std::less<int>" is just the default value, but we have to include it because we want to specify the fourth parameter, which is the allocator type.
+Note that the third parameter `std::less<int>` is just the default value, but we have to include it because we want to specify the fourth parameter, which is the allocator type.
 
 \section StlContainers_vector The case of std::vector
 
-The situation with std::vector was even worse (explanation below) so we had to specialize it for the Eigen::aligned_allocator type. In practice you \b must use the Eigen::aligned_allocator (not another aligned allocator), \b and \#include <Eigen/StdVector>.
+This section is for c++98/03 users only. \cpp11 (or above) users can stop reading here.
+
+So in c++98/03, the situation with `std::vector` is more complicated because of a bug in the standard (explanation below).
+To workaround the issue, we had to specialize it for the Eigen::aligned_allocator type.
+In practice you \b must use the Eigen::aligned_allocator (not another aligned allocator), \b and \#include <Eigen/StdVector>.
 
 Here is an example:
 \code
@@ -39,12 +47,16 @@ Here is an example:
 std::vector<Eigen::Vector4f,Eigen::aligned_allocator<Eigen::Vector4f> >
 \endcode
 
+<span class="note">\b Explanation: The `resize()` method of `std::vector` takes a `value_type` argument (defaulting to `value_type()`). So with `std::vector<Eigen::Vector4d>`, some Eigen::Vector4d objects will be passed by value, which discards any alignment modifiers, so a Eigen::Vector4d can be created at an unaligned location.
+In order to avoid that, the only solution we saw was to specialize `std::vector` to make it work on a slight modification of, here, Eigen::Vector4d, that is able to deal properly with this situation.
+</span>
+
 \subsection vector_spec An alternative - specializing std::vector for Eigen types
 
 As an alternative to the recommended approach described above, you have the option to specialize std::vector for Eigen types requiring alignment. 
-The advantage is that you won't need to declare std::vector all over with Eigen::allocator. One drawback on the other hand side is that
-the specialization needs to be defined before all code pieces in which e.g. std::vector<Vector2d> is used. Otherwise, without knowing the specialization
-the compiler will compile that particular instance with the default std::allocator and you program is most likely to crash.
+The advantage is that you won't need to declare std::vector all over with Eigen::aligned_allocator. One drawback on the other hand side is that
+the specialization needs to be defined before all code pieces in which e.g. `std::vector<Vector2d>` is used. Otherwise, without knowing the specialization
+the compiler will compile that particular instance with the default `std::allocator` and you program is most likely to crash.
 
 Here is an example:
 \code
@@ -54,8 +66,7 @@ EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Matrix2d)
 std::vector<Eigen::Vector2d>
 \endcode
 
-<span class="note">\b Explanation: The resize() method of std::vector takes a value_type argument (defaulting to value_type()). So with std::vector<Eigen::Vector4f>, some Eigen::Vector4f objects will be passed by value, which discards any alignment modifiers, so a Eigen::Vector4f can be created at an unaligned location. In order to avoid that, the only solution we saw was to specialize std::vector to make it work on a slight modification of, here, Eigen::Vector4f, that is able to deal properly with this situation.
-</span>
+
 
 */
 
diff --git a/contrib/eigen/doc/StructHavingEigenMembers.dox b/contrib/eigen/doc/StructHavingEigenMembers.dox
index 7fbed0eb01c38405170f40028762676c55203438..87016cdc995c90b7aedb7fa0d4426fb20989b5ac 100644
--- a/contrib/eigen/doc/StructHavingEigenMembers.dox
+++ b/contrib/eigen/doc/StructHavingEigenMembers.dox
@@ -6,7 +6,12 @@ namespace Eigen {
 
 \section StructHavingEigenMembers_summary Executive Summary
 
-If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must overload its "operator new" so that it generates 16-bytes-aligned pointers. Fortunately, %Eigen provides you with a macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW that does that for you.
+
+If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must ensure that calling operator new on it allocates properly aligned buffers.
+If you're compiling in \cpp17 mode only with a sufficiently recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then everything is taken care by the compiler and you can stop reading.
+
+Otherwise, you have to overload its `operator new` so that it generates properly aligned pointers (e.g., 32-bytes-aligned for Vector4d and AVX).
+Fortunately, %Eigen provides you with a macro `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` that does that for you.
 
 \section StructHavingEigenMembers_what What kind of code needs to be changed?
 
@@ -29,13 +34,13 @@ In other words: you have a class that has as a member a \ref TopicFixedSizeVecto
 
 \section StructHavingEigenMembers_how How should such code be modified?
 
-Very easy, you just need to put a EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro in a public part of your class, like this:
+Very easy, you just need to put a `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` macro in a public part of your class, like this:
 
 \code
 class Foo
 {
   ...
-  Eigen::Vector2d v;
+  Eigen::Vector4d v;
   ...
 public:
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
@@ -46,7 +51,9 @@ public:
 Foo *foo = new Foo;
 \endcode
 
-This macro makes "new Foo" always return an aligned pointer.
+This macro makes `new Foo` always return an aligned pointer.
+
+In \cpp17, this macro is empty.
 
 If this approach is too intrusive, see also the \ref StructHavingEigenMembers_othersolutions "other solutions".
 
@@ -58,7 +65,7 @@ OK let's say that your code looks like this:
 class Foo
 {
   ...
-  Eigen::Vector2d v;
+  Eigen::Vector4d v;
   ...
 };
 
@@ -67,45 +74,59 @@ class Foo
 Foo *foo = new Foo;
 \endcode
 
-A Eigen::Vector2d consists of 2 doubles, which is 128 bits. Which is exactly the size of a SSE packet, which makes it possible to use SSE for all sorts of operations on this vector. But SSE instructions (at least the ones that %Eigen uses, which are the fast ones) require 128-bit alignment. Otherwise you get a segmentation fault.
+A Eigen::Vector4d consists of 4 doubles, which is 256 bits.
+This is exactly the size of an AVX register, which makes it possible to use AVX for all sorts of operations on this vector.
+But AVX instructions (at least the ones that %Eigen uses, which are the fast ones) require 256-bit alignment.
+Otherwise you get a segmentation fault.
 
-For this reason, Eigen takes care by itself to require 128-bit alignment for Eigen::Vector2d, by doing two things:
-\li Eigen requires 128-bit alignment for the Eigen::Vector2d's array (of 2 doubles). With GCC, this is done with a __attribute__ ((aligned(16))).
-\li Eigen overloads the "operator new" of Eigen::Vector2d so it will always return 128-bit aligned pointers.
+For this reason, %Eigen takes care by itself to require 256-bit alignment for Eigen::Vector4d, by doing two things:
+\li %Eigen requires 256-bit alignment for the Eigen::Vector4d's array (of 4 doubles). With \cpp11 this is done with the <a href="https://en.cppreference.com/w/cpp/keyword/alignas">alignas</a> keyword, or compiler's extensions for c++98/03.
+\li %Eigen overloads the `operator new` of Eigen::Vector4d so it will always return 256-bit aligned pointers. (removed in \cpp17)
 
-Thus, normally, you don't have to worry about anything, Eigen handles alignment for you...
+Thus, normally, you don't have to worry about anything, %Eigen handles alignment of operator new for you...
 
-... except in one case. When you have a class Foo like above, and you dynamically allocate a new Foo as above, then, since Foo doesn't have aligned "operator new", the returned pointer foo is not necessarily 128-bit aligned.
+... except in one case. When you have a `class Foo` like above, and you dynamically allocate a new `Foo` as above, then, since `Foo` doesn't have aligned `operator new`, the returned pointer foo is not necessarily 256-bit aligned.
 
-The alignment attribute of the member v is then relative to the start of the class, foo. If the foo pointer wasn't aligned, then foo->v won't be aligned either!
+The alignment attribute of the member `v` is then relative to the start of the class `Foo`. If the `foo` pointer wasn't aligned, then `foo->v` won't be aligned either!
 
-The solution is to let class Foo have an aligned "operator new", as we showed in the previous section.
+The solution is to let `class Foo` have an aligned `operator new`, as we showed in the previous section.
+
+This explanation also holds for SSE/NEON/MSA/Altivec/VSX targets, which require 16-bytes alignment, and AVX512 which requires 64-bytes alignment for fixed-size objects multiple of 64 bytes (e.g., Eigen::Matrix4d).
 
 \section StructHavingEigenMembers_movetotop Should I then put all the members of Eigen types at the beginning of my class?
 
-That's not required. Since Eigen takes care of declaring 128-bit alignment, all members that need it are automatically 128-bit aligned relatively to the class. So code like this works fine:
+That's not required. Since %Eigen takes care of declaring adequate alignment, all members that need it are automatically aligned relatively to the class. So code like this works fine:
 
 \code
 class Foo
 {
   double x;
-  Eigen::Vector2d v;
+  Eigen::Vector4d v;
 public:
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
 };
 \endcode
 
+That said, as usual, it is recommended to sort the members so that alignment does not waste memory.
+In the above example, with AVX, the compiler will have to reserve 24 empty bytes between `x` and `v`.
+
+
 \section StructHavingEigenMembers_dynamicsize What about dynamic-size matrices and vectors?
 
 Dynamic-size matrices and vectors, such as Eigen::VectorXd, allocate dynamically their own array of coefficients, so they take care of requiring absolute alignment automatically. So they don't cause this issue. The issue discussed here is only with \ref TopicFixedSizeVectorizable  "fixed-size vectorizable matrices and vectors".
 
+
 \section StructHavingEigenMembers_bugineigen So is this a bug in Eigen?
 
-No, it's not our bug. It's more like an inherent problem of the C++98 language specification, and seems to be taken care of in the upcoming language revision: <a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2341.pdf">see this document</a>.
+No, it's not our bug. It's more like an inherent problem of the c++ language specification that has been solved in c++17 through the feature known as <a href="http://wg21.link/p0035r4">dynamic memory allocation for over-aligned data</a>.
+
 
-\section StructHavingEigenMembers_conditional What if I want to do this conditionnally (depending on template parameters) ?
+\section StructHavingEigenMembers_conditional What if I want to do this conditionally (depending on template parameters) ?
 
-For this situation, we offer the macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign). It will generate aligned operators like EIGEN_MAKE_ALIGNED_OPERATOR_NEW if NeedsToAlign is true. It will generate operators with the default alignment if NeedsToAlign is false.
+For this situation, we offer the macro `EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)`.
+It will generate aligned operators like `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` if `NeedsToAlign` is true.
+It will generate operators with the default alignment if `NeedsToAlign` is false.
+In \cpp17, this macro is empty.
 
 Example:
 
@@ -130,7 +151,7 @@ Foo<3> *foo3 = new Foo<3>; // foo3 has only the system default alignment guarant
 
 \section StructHavingEigenMembers_othersolutions Other solutions
 
-In case putting the EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro everywhere is too intrusive, there exists at least two other solutions.
+In case putting the `EIGEN_MAKE_ALIGNED_OPERATOR_NEW` macro everywhere is too intrusive, there exists at least two other solutions.
 
 \subsection othersolutions1 Disabling alignment
 
@@ -139,22 +160,13 @@ The first is to disable alignment requirement for the fixed size members:
 class Foo
 {
   ...
-  Eigen::Matrix<double,2,1,Eigen::DontAlign> v;
+  Eigen::Matrix<double,4,1,Eigen::DontAlign> v;
   ...
 };
 \endcode
-This has for effect to disable vectorization when using \c v.
-If a function of Foo uses it several times, then it still possible to re-enable vectorization by copying it into an aligned temporary vector:
-\code
-void Foo::bar()
-{
-  Eigen::Vector2d av(v);
-  // use av instead of v
-  ...
-  // if av changed, then do:
-  v = av;
-}
-\endcode
+This `v` is fully compatible with aligned Eigen::Vector4d.
+This has only for effect to make load/stores to `v` more expensive (usually slightly, but that's hardware dependent).
+
 
 \subsection othersolutions2 Private structure
 
@@ -164,7 +176,7 @@ The second consist in storing the fixed-size objects into a private struct which
 struct Foo_d
 {
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  Vector2d v;
+  Vector4d v;
   ...
 };
 
@@ -183,7 +195,8 @@ private:
 };
 \endcode
 
-The clear advantage here is that the class Foo remains unchanged regarding alignment issues. The drawback is that a heap allocation will be required whatsoever.
+The clear advantage here is that the class `Foo` remains unchanged regarding alignment issues.
+The drawback is that an additional heap allocation will be required whatsoever.
 
 */
 
diff --git a/contrib/eigen/doc/TemplateKeyword.dox b/contrib/eigen/doc/TemplateKeyword.dox
index b84cfdae96f56046e665cc0b135dde0bc4e0a503..fbf2c708165b76830bc06c0974f60943fdb6ae10 100644
--- a/contrib/eigen/doc/TemplateKeyword.dox
+++ b/contrib/eigen/doc/TemplateKeyword.dox
@@ -76,7 +76,7 @@ point where the template is defined, without knowing the actual value of the tem
 and \c Derived2 in the example). That means that the compiler cannot know that <tt>dst.triangularView</tt> is
 a member template and that the following &lt; symbol is part of the delimiter for the template
 parameter. Another possibility would be that <tt>dst.triangularView</tt> is a member variable with the &lt;
-symbol refering to the <tt>operator&lt;()</tt> function. In fact, the compiler should choose the second
+symbol referring to the <tt>operator&lt;()</tt> function. In fact, the compiler should choose the second
 possibility, according to the standard. If <tt>dst.triangularView</tt> is a member template (as in our case),
 the programmer should specify this explicitly with the \c template keyword and write <tt>dst.template
 triangularView</tt>.
diff --git a/contrib/eigen/doc/TopicCMakeGuide.dox b/contrib/eigen/doc/TopicCMakeGuide.dox
index 896cfa831d5b1d3ce465081028667abc546f3e10..cf767d0ddabacd1a5abdfc7b3ad742defde1c66c 100644
--- a/contrib/eigen/doc/TopicCMakeGuide.dox
+++ b/contrib/eigen/doc/TopicCMakeGuide.dox
@@ -32,9 +32,13 @@ which requires at least version 3.3 of %Eigen. Here, `path-to-example-directory`
 is the path to the directory that contains both `CMakeLists.txt` and
 `example.cpp`.
 
-If you have multiple installed version of %Eigen, you can pick your favorite one by setting the \c Eigen3_DIR cmake's variable to the respective path containing the \c Eigen3*.cmake files. For instance:
-\code
-cmake path-to-example-directory -DEigen3_DIR=$HOME/mypackages/share/eigen3/cmake/
+Do not forget to set the <a href="https://cmake.org/cmake/help/v3.7/variable/CMAKE_PREFIX_PATH.html">\c CMAKE_PREFIX_PATH </a> variable if Eigen is not installed in a default location or if you want to pick a specific version. For instance:
+\code{.sh}
+$ cmake path-to-example-directory -DCMAKE_PREFIX_PATH=$HOME/mypackages
+\endcode
+An alternative is to set the \c Eigen3_DIR cmake's variable to the respective path containing the \c Eigen3*.cmake files. For instance:
+\code{.sh}
+$ cmake path-to-example-directory -DEigen3_DIR=$HOME/mypackages/share/eigen3/cmake/
 \endcode
 
 If the `REQUIRED` option is omitted when locating %Eigen using
diff --git a/contrib/eigen/doc/TopicLinearAlgebraDecompositions.dox b/contrib/eigen/doc/TopicLinearAlgebraDecompositions.dox
index d9db67755352cd0b45c899bef5953c47cdf27df4..402b3769e2f3e44e6c145e9d0bdeefe82d9a6462 100644
--- a/contrib/eigen/doc/TopicLinearAlgebraDecompositions.dox
+++ b/contrib/eigen/doc/TopicLinearAlgebraDecompositions.dox
@@ -72,7 +72,7 @@ To get an overview of the true relative speed of the different decompositions, c
         <td>Orthogonalization</td>
         <td>Yes</td>
         <td>Excellent</td>
-        <td><em>Soon: blocking</em></td>
+        <td><em>-</em></td>
     </tr>
 
     <tr>
@@ -88,6 +88,18 @@ To get an overview of the true relative speed of the different decompositions, c
     </tr>
 
     <tr class="alt">
+        <td>CompleteOrthogonalDecomposition</td>
+        <td>-</td>
+        <td>Fast</td>
+        <td>Good</td>
+        <td>Yes</td>
+        <td>Orthogonalization</td>
+        <td>Yes</td>
+        <td>Excellent</td>
+        <td><em>-</em></td>
+    </tr>
+
+    <tr>
         <td>LLT</td>
         <td>Positive definite</td>
         <td>Very fast</td>
@@ -99,7 +111,7 @@ To get an overview of the true relative speed of the different decompositions, c
         <td>Blocking</td>
     </tr>
 
-    <tr>
+    <tr class="alt">
         <td>LDLT</td>
         <td>Positive or negative semidefinite<sup><a href="#note1">1</a></sup></td>
         <td>Very fast</td>
@@ -260,7 +272,7 @@ To get an overview of the true relative speed of the different decompositions, c
   <dt><b>Blocking</b></dt>
     <dd>Means the algorithm can work per block, whence guaranteeing a good scaling of the performance for large matrices.</dd>
   <dt><b>Implicit Multi Threading (MT)</b></dt>
-    <dd>Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product rountines.</dd>
+    <dd>Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product routines.</dd>
   <dt><b>Explicit Multi Threading (MT)</b></dt>
     <dd>Means the algorithm is explicitly parallelized to take advantage of multicore processors via OpenMP.</dd>
   <dt><b>Meta-unroller</b></dt>
diff --git a/contrib/eigen/doc/TopicMultithreading.dox b/contrib/eigen/doc/TopicMultithreading.dox
index 47c9b261f89db1ca116a9ba3185cfec78ccda316..7a8ff301f02f961e28d8168389ea0982d59c23b2 100644
--- a/contrib/eigen/doc/TopicMultithreading.dox
+++ b/contrib/eigen/doc/TopicMultithreading.dox
@@ -4,22 +4,25 @@ namespace Eigen {
 
 \section TopicMultiThreading_MakingEigenMT Make Eigen run in parallel
 
-Some Eigen's algorithms can exploit the multiple cores present in your hardware. To this end, it is enough to enable OpenMP on your compiler, for instance:
- * GCC: \c -fopenmp
- * ICC: \c -openmp
- * MSVC: check the respective option in the build properties.
-You can control the number of thread that will be used using either the OpenMP API or Eigen's API using the following priority:
+Some %Eigen's algorithms can exploit the multiple cores present in your hardware.
+To this end, it is enough to enable OpenMP on your compiler, for instance:
+ - GCC: \c -fopenmp
+ - ICC: \c -openmp
+ - MSVC: check the respective option in the build properties.
+
+You can control the number of threads that will be used using either the OpenMP API or %Eigen's API using the following priority:
 \code
  OMP_NUM_THREADS=n ./my_program
  omp_set_num_threads(n);
  Eigen::setNbThreads(n);
 \endcode
-Unless setNbThreads has been called, Eigen uses the number of threads specified by OpenMP. You can restore this behavior by calling \code setNbThreads(0); \endcode
+Unless `setNbThreads` has been called, %Eigen uses the number of threads specified by OpenMP.
+You can restore this behavior by calling `setNbThreads(0);`.
 You can query the number of threads that will be used with:
 \code
 n = Eigen::nbThreads( );
 \endcode
-You can disable Eigen's multi threading at compile time by defining the EIGEN_DONT_PARALLELIZE preprocessor token.
+You can disable %Eigen's multi threading at compile time by defining the \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_PARALLELIZE \endlink preprocessor token.
 
 Currently, the following algorithms can make use of multi-threading:
  - general dense matrix - matrix products
@@ -29,9 +32,17 @@ Currently, the following algorithms can make use of multi-threading:
  - BiCGSTAB with a row-major sparse matrix format.
  - LeastSquaresConjugateGradient
 
+\warning On most OS it is <strong>very important</strong> to limit the number of threads to the number of physical cores, otherwise significant slowdowns are expected, especially for operations involving dense matrices.
+
+Indeed, the principle of hyper-threading is to run multiple threads (in most cases 2) on a single core in an interleaved manner.
+However, %Eigen's matrix-matrix product kernel is fully optimized and already exploits nearly 100% of the CPU capacity.
+Consequently, there is no room for running multiple such threads on a single core, and the performance would drops significantly because of cache pollution and other sources of overheads.
+At this stage of reading you're probably wondering why %Eigen does not limit itself to the number of physical cores?
+This is simply because OpenMP does not allow to know the number of physical cores, and thus %Eigen will launch as many threads as <i>cores</i> reported by OpenMP.
+
 \section TopicMultiThreading_UsingEigenWithMT Using Eigen in a multi-threaded application
 
-In the case your own application is multithreaded, and multiple threads make calls to Eigen, then you have to initialize Eigen by calling the following routine \b before creating the threads:
+In the case your own application is multithreaded, and multiple threads make calls to %Eigen, then you have to initialize %Eigen by calling the following routine \b before creating the threads:
 \code
 #include <Eigen/Core>
 
@@ -43,12 +54,14 @@ int main(int argc, char** argv)
 }
 \endcode
 
-\note With Eigen 3.3, and a fully C++11 compliant compiler (i.e., <a href="http://en.cppreference.com/w/cpp/language/storage_duration#Static_local_variables">thread-safe static local variable initialization</a>), then calling \c initParallel() is optional.
+\note With %Eigen 3.3, and a fully C++11 compliant compiler (i.e., <a href="http://en.cppreference.com/w/cpp/language/storage_duration#Static_local_variables">thread-safe static local variable initialization</a>), then calling \c initParallel() is optional.
 
-\warning note that all functions generating random matrices are \b not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to Eigen::initParallel(). This is because these functions are based on std::rand which is not re-entrant. For thread-safe random generator, we recommend the use of boost::random or c++11 random feature.
+\warning Note that all functions generating random matrices are \b not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to `Eigen::initParallel()`. This is because these functions are based on `std::rand` which is not re-entrant.
+For thread-safe random generator, we recommend the use of c++11 random generators (\link DenseBase::NullaryExpr(Index, const CustomNullaryOp&) example \endlink) or `boost::random`.
 
-In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallization as detailed in the previous section.
+In the case your application is parallelized with OpenMP, you might want to disable %Eigen's own parallelization as detailed in the previous section.
 
+\warning Using OpenMP with custom scalar types that might throw exceptions can lead to unexpected behaviour in the event of throwing.
 */
 
 }
diff --git a/contrib/eigen/doc/TutorialBlockOperations.dox b/contrib/eigen/doc/TutorialBlockOperations.dox
index a2d8c97cc094f62b0bd64278658272792362295a..df277482c897e23675520e21e0fb07b50ff0cce5 100644
--- a/contrib/eigen/doc/TutorialBlockOperations.dox
+++ b/contrib/eigen/doc/TutorialBlockOperations.dox
@@ -167,6 +167,20 @@ matrix.rightCols(q);\endcode </td>
     <td>\code 
 matrix.rightCols<q>();\endcode </td>
 </tr>
+<tr><td>%Block containing the q columns starting from i
+                    \link DenseBase::middleCols() * \endlink</td>
+    <td>\code
+matrix.middleCols(i,q);\endcode </td>
+    <td>\code 
+matrix.middleCols<q>(i);\endcode </td>
+</tr>
+<tr><td>%Block containing the q rows starting from i
+                    \link DenseBase::middleRows() * \endlink</td>
+    <td>\code
+matrix.middleRows(i,q);\endcode </td>
+    <td>\code 
+matrix.middleRows<q>(i);\endcode </td>
+</tr>
 </table>
 
 Here is a simple example illustrating the use of the operations presented above:
diff --git a/contrib/eigen/doc/TutorialGeometry.dox b/contrib/eigen/doc/TutorialGeometry.dox
index 723f4dbcee9064df707b69761feeac169954c986..1d214f35540ca0be7b2beacab149dea801f64099 100644
--- a/contrib/eigen/doc/TutorialGeometry.dox
+++ b/contrib/eigen/doc/TutorialGeometry.dox
@@ -111,7 +111,7 @@ rot3 = rot1.slerp(alpha,rot2);\endcode</td></tr>
 
 
 <a href="#" class="top">top</a>\section TutorialGeoTransform Affine transformations
-Generic affine transformations are represented by the Transform class which internaly
+Generic affine transformations are represented by the Transform class which internally
 is a (Dim+1)^2 matrix. In Eigen we have chosen to not distinghish between points and
 vectors such that all points are actually represented by displacement vectors from the
 origin ( \f$ \mathbf{p} \equiv \mathbf{p}-0 \f$ ). With that in mind, real points and
diff --git a/contrib/eigen/doc/TutorialLinearAlgebra.dox b/contrib/eigen/doc/TutorialLinearAlgebra.dox
index a72724143c693fd918268be84e722e093c2c65bf..8042fcad333788acc479a97d1b39809d3bcefd0a 100644
--- a/contrib/eigen/doc/TutorialLinearAlgebra.dox
+++ b/contrib/eigen/doc/TutorialLinearAlgebra.dox
@@ -14,7 +14,7 @@ QR, %SVD, eigendecompositions... After reading this page, don't miss our
     \f[ Ax \: = \: b \f]
 Where \a A and \a b are matrices (\a b could be a vector, as a special case). You want to find a solution \a x.
 
-\b The \b solution: You can choose between various decompositions, depending on what your matrix \a A looks like,
+\b The \b solution: You can choose between various decompositions, depending on the properties of your matrix \a A,
 and depending on whether you favor speed or accuracy. However, let's start with an example that works in all cases,
 and is a good compromise:
 <table class="example">
@@ -34,7 +34,7 @@ Vector3f x = dec.solve(b);
 
 Here, ColPivHouseholderQR is a QR decomposition with column pivoting. It's a good compromise for this tutorial, as it
 works for all matrices while being quite fast. Here is a table of some other decompositions that you can choose from,
-depending on your matrix and the trade-off you want to make:
+depending on your matrix, the problem you are trying to solve, and the trade-off you want to make:
 
 <table class="manual">
     <tr>
@@ -128,11 +128,13 @@ depending on your matrix and the trade-off you want to make:
 </table>
 To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
 
-All of these decompositions offer a solve() method that works as in the above example.
+All of these decompositions offer a solve() method that works as in the above example. 
 
-For example, if your matrix is positive definite, the above table says that a very good
-choice is then the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general
-matrix (not a vector) as right hand side is possible.
+If you know more about the properties of your matrix, you can use the above table to select the best method.
+For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU.
+If you know that your matrix is also symmetric and positive definite, the above table says that
+a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general
+matrix (not a vector) as right hand side is possible:
 
 <table class="example">
 <tr><th>Example:</th><th>Output:</th></tr>
@@ -146,7 +148,34 @@ For a \ref TopicLinearAlgebraDecompositions "much more complete table" comparing
 supports many other decompositions), see our special page on
 \ref TopicLinearAlgebraDecompositions "this topic".
 
-\section TutorialLinAlgSolutionExists Checking if a solution really exists
+
+\section TutorialLinAlgLeastsquares Least squares solving
+
+The most general and accurate method to solve under- or over-determined linear systems
+in the least squares sense, is the SVD decomposition. Eigen provides two implementations.
+The recommended one is the BDCSVD class, which scales well for large problems
+and automatically falls back to the JacobiSVD class for smaller problems.
+For both classes, their solve() method solved the linear system in the least-squares
+sense. 
+
+Here is an example:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr>
+  <td>\include TutorialLinAlgSVDSolve.cpp </td>
+  <td>\verbinclude TutorialLinAlgSVDSolve.out </td>
+</tr>
+</table>
+
+An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition. 
+
+Again, if you know more about the problem, the table above contains methods that are potentially faster.
+If your matrix is full rank, HouseHolderQR is the method of choice. If your matrix is full rank and well conditioned,
+using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still.
+Our page on \link LeastSquares least squares solving \endlink has more details.
+
+
+\section TutorialLinAlgSolutionExists Checking if a matrix is singular
 
 Only you know what error margin you want to allow for a solution to be considered valid.
 So Eigen lets you do this computation for yourself, if you want to, as in this example:
@@ -179,11 +208,11 @@ very rare. The call to info() is to check for this possibility.
 \section TutorialLinAlgInverse Computing inverse and determinant
 
 First of all, make sure that you really want this. While inverse and determinant are fundamental mathematical concepts,
-in \em numerical linear algebra they are not as popular as in pure mathematics. Inverse computations are often
+in \em numerical linear algebra they are not as useful as in pure mathematics. Inverse computations are often
 advantageously replaced by solve() operations, and the determinant is often \em not a good way of checking if a matrix
 is invertible.
 
-However, for \em very \em small matrices, the above is not true, and inverse and determinant can be very useful.
+However, for \em very \em small matrices, the above may not be true, and inverse and determinant can be very useful.
 
 While certain decompositions, such as PartialPivLU and FullPivLU, offer inverse() and determinant() methods, you can also
 call inverse() and determinant() directly on a matrix. If your matrix is of a very small fixed size (at most 4x4) this
@@ -198,28 +227,6 @@ Here is an example:
 </tr>
 </table>
 
-\section TutorialLinAlgLeastsquares Least squares solving
-
-The most accurate method to do least squares solving is with a SVD decomposition.
-Eigen provides two implementations.
-The recommended one is the BDCSVD class, which scale well for large problems
-and automatically fall-back to the JacobiSVD class for smaller problems.
-For both classes, their solve() method is doing least-squares solving.
-
-Here is an example:
-<table class="example">
-<tr><th>Example:</th><th>Output:</th></tr>
-<tr>
-  <td>\include TutorialLinAlgSVDSolve.cpp </td>
-  <td>\verbinclude TutorialLinAlgSVDSolve.out </td>
-</tr>
-</table>
-
-Another methods, potentially faster but less reliable, are to use a Cholesky decomposition of the
-normal matrix or a QR decomposition. Our page on \link LeastSquares least squares solving \endlink
-has more details.
-
-
 \section TutorialLinAlgSeparateComputation Separating the computation from the construction
 
 In the above examples, the decomposition was computed at the same time that the decomposition object was constructed.
diff --git a/contrib/eigen/doc/TutorialMapClass.dox b/contrib/eigen/doc/TutorialMapClass.dox
index f8fb0fd2f2156561b7316153de93015b71ae68c0..caa2539d81e04bbf113e2197b18d8c2c5b0355be 100644
--- a/contrib/eigen/doc/TutorialMapClass.dox
+++ b/contrib/eigen/doc/TutorialMapClass.dox
@@ -29,9 +29,9 @@ Map<const Vector4i> mi(pi);
 \endcode
 where \c pi is an \c int \c *. In this case the size does not have to be passed to the constructor, because it is already specified by the Matrix/Array type.
 
-Note that Map does not have a default constructor; you \em must pass a pointer to intialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew).
+Note that Map does not have a default constructor; you \em must pass a pointer to initialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew).
 
-Map is flexible enough to accomodate a variety of different data representations.  There are two other (optional) template parameters:
+Map is flexible enough to accommodate a variety of different data representations.  There are two other (optional) template parameters:
 \code
 Map<typename MatrixType,
     int MapOptions,
diff --git a/contrib/eigen/doc/TutorialMatrixClass.dox b/contrib/eigen/doc/TutorialMatrixClass.dox
index 7ea0cd789b7f35400caf0d2a2a919d8087d3e5bf..2c452220f433757f0921668064c2f49b0da167ae 100644
--- a/contrib/eigen/doc/TutorialMatrixClass.dox
+++ b/contrib/eigen/doc/TutorialMatrixClass.dox
@@ -101,13 +101,41 @@ Matrix3f a(3,3);
 \endcode
 and is a no-operation.
 
-Finally, we also offer some constructors to initialize the coefficients of small fixed-size vectors up to size 4:
+Matrices and vectors can also be initialized from lists of coefficients.
+Prior to C++11, this feature is limited to small fixed-size column or vectors up to size 4:
 \code
 Vector2d a(5.0, 6.0);
 Vector3d b(5.0, 6.0, 7.0);
 Vector4d c(5.0, 6.0, 7.0, 8.0);
 \endcode
 
+If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized by passing an arbitrary number of coefficients:
+\code
+Vector2i a(1, 2);                      // A column vector containing the elements {1, 2}
+Matrix<int, 5, 1> b {1, 2, 3, 4, 5};   // A row-vector containing the elements {1, 2, 3, 4, 5}
+Matrix<int, 1, 5> c = {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5}
+\endcode
+
+In the general case of matrices and vectors with either fixed or runtime sizes,
+coefficients have to be grouped by rows and passed as an initializer list of initializer list (\link Matrix::Matrix(const std::initializer_list<std::initializer_list<Scalar>>&) details \endlink):
+\code
+MatrixXi a {      // construct a 2x2 matrix
+      {1, 2},     // first row
+      {3, 4}      // second row
+};
+Matrix<double, 2, 3> b {
+      {2, 3, 4},
+      {5, 6, 7},
+};
+\endcode
+
+For column or row vectors, implicit transposition is allowed.
+This means that a column vector can be initialized from a single row:
+\code
+VectorXd a {{1.5, 2.5, 3.5}};             // A column-vector with 3 coefficients
+RowVectorXd b {{1.0, 2.0, 3.0, 4.0}};     // A row-vector with 4 coefficients
+\endcode
+
 \section TutorialMatrixCoeffAccessors Coefficient accessors
 
 The primary coefficient accessors and mutators in Eigen are the overloaded parenthesis operators.
diff --git a/contrib/eigen/doc/TutorialReshape.dox b/contrib/eigen/doc/TutorialReshape.dox
new file mode 100644
index 0000000000000000000000000000000000000000..5b4022a3b7efbdb89bd9228756312dc591579e3a
--- /dev/null
+++ b/contrib/eigen/doc/TutorialReshape.dox
@@ -0,0 +1,82 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialReshape Reshape
+
+Since the version 3.4, %Eigen exposes convenient methods to reshape a matrix to another matrix of different sizes or vector.
+All cases are handled via the DenseBase::reshaped(NRowsType,NColsType) and DenseBase::reshaped() functions.
+Those functions do not perform in-place reshaping, but instead return a <i> view </i> on the input expression.
+
+\eigenAutoToc
+
+\section TutorialReshapeMat2Mat Reshaped 2D views
+
+The more general reshaping transformation is handled via: `reshaped(nrows,ncols)`.
+Here is an example reshaping a 4x4 matrix to a 2x8 one:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include MatrixBase_reshaped_int_int.cpp
+</td>
+<td>
+\verbinclude MatrixBase_reshaped_int_int.out
+</td></tr></table>
+
+By default, the input coefficients are always interpreted in column-major order regardless of the storage order of the input expression.
+For more control on ordering, compile-time sizes, and automatic size deduction, please see de documentation of DenseBase::reshaped(NRowsType,NColsType) that contains all the details with many examples.
+
+
+\section TutorialReshapeMat2Vec 1D linear views
+
+A very common usage of reshaping is to create a 1D linear view over a given 2D matrix or expression.
+In this case, sizes can be deduced and thus omitted as in the following example:
+
+<table class="example">
+<tr><th>Example:</th></tr>
+<tr><td>
+\include MatrixBase_reshaped_to_vector.cpp
+</td></tr>
+<tr><th>Output:</th></tr>
+<tr><td>
+\verbinclude MatrixBase_reshaped_to_vector.out
+</td></tr></table>
+
+This shortcut always returns a column vector and by default input coefficients are always interpreted in column-major order.
+Again, see the documentation of DenseBase::reshaped() for more control on the ordering.
+
+\section TutorialReshapeInPlace
+
+The above examples create reshaped views, but what about reshaping inplace a given matrix?
+Of course this task in only conceivable for matrix and arrays having runtime dimensions.
+In many cases, this can be accomplished via PlainObjectBase::resize(Index,Index):
+
+<table class="example">
+<tr><th>Example:</th></tr>
+<tr><td>
+\include Tutorial_reshaped_vs_resize_1.cpp
+</td></tr>
+<tr><th>Output:</th></tr>
+<tr><td>
+\verbinclude Tutorial_reshaped_vs_resize_1.out
+</td></tr></table>
+
+However beware that unlike \c reshaped, the result of \c resize depends on the input storage order.
+It thus behaves similarly to `reshaped<AutoOrder>`:
+
+<table class="example">
+<tr><th>Example:</th></tr>
+<tr><td>
+\include Tutorial_reshaped_vs_resize_2.cpp
+</td></tr>
+<tr><th>Output:</th></tr>
+<tr><td>
+\verbinclude Tutorial_reshaped_vs_resize_2.out
+</td></tr></table>
+
+Finally, assigning a reshaped matrix to itself is currently not supported and will result to undefined-behavior because of \link TopicAliasing aliasing \endlink.
+The following is forbidden: \code A = A.reshaped(2,8); \endcode
+This is OK: \code A = A.reshaped(2,8).eval(); \endcode
+
+*/
+
+}
diff --git a/contrib/eigen/doc/TutorialReshapeSlicing.dox b/contrib/eigen/doc/TutorialReshapeSlicing.dox
deleted file mode 100644
index 3730a5de6eccc1d3fa7ddb7fb83f94ab7210378f..0000000000000000000000000000000000000000
--- a/contrib/eigen/doc/TutorialReshapeSlicing.dox
+++ /dev/null
@@ -1,65 +0,0 @@
-namespace Eigen {
-
-/** \eigenManualPage TutorialReshapeSlicing Reshape and Slicing
-
-%Eigen does not expose convenient methods to take slices or to reshape a matrix yet.
-Nonetheless, such features can easily be emulated using the Map class.
-
-\eigenAutoToc
-
-\section TutorialReshape Reshape
-
-A reshape operation consists in modifying the sizes of a matrix while keeping the same coefficients.
-Instead of modifying the input matrix itself, which is not possible for compile-time sizes, the approach consist in creating a different \em view on the storage using class Map.
-Here is a typical example creating a 1D linear view of a matrix:
-
-<table class="example">
-<tr><th>Example:</th><th>Output:</th></tr>
-<tr><td>
-\include Tutorial_ReshapeMat2Vec.cpp
-</td>
-<td>
-\verbinclude Tutorial_ReshapeMat2Vec.out
-</td></tr></table>
-
-Remark how the storage order of the input matrix modifies the order of the coefficients in the linear view.
-Here is another example reshaping a 2x6 matrix to a 6x2 one:
-<table class="example">
-<tr><th>Example:</th><th>Output:</th></tr>
-<tr><td>
-\include Tutorial_ReshapeMat2Mat.cpp
-</td>
-<td>
-\verbinclude Tutorial_ReshapeMat2Mat.out
-</td></tr></table>
-
-
-
-\section TutorialSlicing Slicing
-
-Slicing consists in taking a set of rows, columns, or elements, uniformly spaced within a matrix.
-Again, the class Map allows to easily mimic this feature.
-
-For instance, one can skip every P elements in a vector:
-<table class="example">
-<tr><th>Example:</th><th>Output:</th></tr>
-<tr><td>
-\include Tutorial_SlicingVec.cpp
-</td>
-<td>
-\verbinclude Tutorial_SlicingVec.out
-</td></tr></table>
-
-One can olso take one column over three using an adequate outer-stride or inner-stride depending on the actual storage order:
-<table class="example">
-<tr><th>Example:</th><th>Output:</th></tr>
-<tr><td>
-\include Tutorial_SlicingCol.cpp
-</td>
-<td>
-\verbinclude Tutorial_SlicingCol.out
-</td></tr></table>
-
-*/
-
-}
diff --git a/contrib/eigen/doc/TutorialSTL.dox b/contrib/eigen/doc/TutorialSTL.dox
new file mode 100644
index 0000000000000000000000000000000000000000..9a825bc4885a2d0db802db59cd885a206ce77fb5
--- /dev/null
+++ b/contrib/eigen/doc/TutorialSTL.dox
@@ -0,0 +1,66 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialSTL STL iterators and algorithms
+
+Since the version 3.4, %Eigen's dense matrices and arrays provide STL compatible iterators.
+As demonstrated below, this makes them naturally compatible with range-for-loops and STL's algorithms.
+
+\eigenAutoToc
+
+\section TutorialSTLVectors Iterating over 1D arrays and vectors 
+
+Any dense 1D expressions exposes the pair of `begin()/end()` methods to iterate over them.
+
+This directly enables c++11 range for loops:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_range_for_loop_1d_cxx11.cpp
+</td>
+<td>
+\verbinclude Tutorial_range_for_loop_1d_cxx11.out
+</td></tr></table>
+
+One dimensional expressions can also easily be passed to STL algorithms:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_std_sort.cpp
+</td>
+<td>
+\verbinclude Tutorial_std_sort.out
+</td></tr></table>
+
+Similar to `std::vector`, 1D expressions also exposes the pair of `cbegin()/cend()` methods to conveniently get const iterators on non-const object.
+
+\section TutorialSTLMatrices Iterating over coefficients of 2D arrays and matrices
+
+STL iterators are intrinsically designed to iterate over 1D structures.
+This is why `begin()/end()` methods are disabled for 2D expressions.
+Iterating over all coefficients of a 2D expressions is still easily accomplished by creating a 1D linear view through `reshaped()`:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_range_for_loop_2d_cxx11.cpp
+</td>
+<td>
+\verbinclude Tutorial_range_for_loop_2d_cxx11.out
+</td></tr></table>
+
+\section TutorialSTLRowsColumns Iterating over rows or columns of 2D arrays and matrices
+
+It is also possible to get iterators over rows or columns of 2D expressions.
+Those are available through the `rowwise()` and `colwise()` proxies.
+Here is an example sorting each row of a matrix:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_std_sort_rows_cxx11.cpp
+</td>
+<td>
+\verbinclude Tutorial_std_sort_rows_cxx11.out
+</td></tr></table>
+
+*/
+
+}
diff --git a/contrib/eigen/doc/TutorialSlicingIndexing.dox b/contrib/eigen/doc/TutorialSlicingIndexing.dox
new file mode 100644
index 0000000000000000000000000000000000000000..98ace43e4a5a2cf9e793b0711d3196f1f621638c
--- /dev/null
+++ b/contrib/eigen/doc/TutorialSlicingIndexing.dox
@@ -0,0 +1,244 @@
+namespace Eigen {
+
+/** \eigenManualPage TutorialSlicingIndexing Slicing and Indexing
+
+This page presents the numerous possibilities offered by `operator()` to index sub-set of rows and columns.
+This API has been introduced in %Eigen 3.4.
+It supports all the feature proposed by the \link TutorialBlockOperations block API \endlink, and much more.
+In particular, it supports \b slicing that consists in taking a set of rows, columns, or elements, uniformly spaced within a matrix or indexed from an array of indices.
+
+\eigenAutoToc
+
+\section TutorialSlicingOverview Overview
+
+All the aforementioned operations are handled through the generic DenseBase::operator()(const RowIndices&, const ColIndices&) method.
+Each argument can be:
+  - An integer indexing a single row or column, including symbolic indices.
+  - The symbol Eigen::all representing the whole set of respective rows or columns in increasing order.
+  - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::lastN functions.
+  - Any 1D vector/array of integers including %Eigen's vector/array, expressions, std::vector, std::array, as well as plain C arrays: `int[N]`.
+
+More generally, it can accepts any object exposing the following two member functions:
+  \code
+  <integral type> operator[](<integral type>) const;
+  <integral type> size() const;
+  \endcode
+where `<integral type>` stands for any integer type compatible with Eigen::Index (i.e. `std::ptrdiff_t`).
+
+\section TutorialSlicingBasic Basic slicing
+
+Taking a set of rows, columns, or elements, uniformly spaced within a matrix or vector is achieved through the Eigen::seq or Eigen::seqN functions where "seq" stands for arithmetic sequence. Their signatures are summarized below:
+
+<table class="manual">
+<tr>
+  <th>function</th>
+  <th>description</th>
+  <th>example</th>
+</tr>
+<tr>
+  <td>\code seq(firstIdx,lastIdx) \endcode</td>
+  <td>represents the sequence of integers ranging from \c firstIdx to \c lastIdx</td>
+  <td>\code seq(2,5) <=> {2,3,4,5} \endcode</td>
+</tr>
+<tr>
+  <td>\code seq(firstIdx,lastIdx,incr) \endcode</td>
+  <td>same but using the increment \c incr to advance from one index to the next</td>
+  <td>\code seq(2,8,2) <=> {2,4,6,8} \endcode</td>
+</tr>
+<tr>
+  <td>\code seqN(firstIdx,size) \endcode</td>
+  <td>represents the sequence of \c size integers starting from \c firstIdx</td>
+  <td>\code seqN(2,5) <=> {2,3,4,5,6} \endcode</td>
+</tr>
+<tr>
+  <td>\code seqN(firstIdx,size,incr) \endcode</td>
+  <td>same but using the increment \c incr to advance from one index to the next</td>
+  <td>\code seqN(2,3,3) <=> {2,5,8} \endcode</td>
+</tr>
+</table>
+
+The \c firstIdx and \c lastIdx parameters can also be defined with the help of the Eigen::last symbol representing the index of the last row, column or element of the underlying matrix/vector once the arithmetic sequence is passed to it through operator().
+Here are some examples for a 2D array/matrix \c A and a 1D array/vector \c v.
+<table class="manual">
+<tr>
+  <th>Intent</th>
+  <th>Code</th>
+  <th>Block-API equivalence</th>
+</tr>
+<tr>
+  <td>Bottom-left corner starting at row \c i with \c n columns</td>
+  <td>\code A(seq(i,last), seqN(0,n)) \endcode</td>
+  <td>\code A.bottomLeftCorner(A.rows()-i,n) \endcode</td>
+</tr>
+<tr>
+  <td>%Block starting at \c i,j having \c m rows, and \c n columns</td>
+  <td>\code A(seqN(i,m), seqN(i,n) \endcode</td>
+  <td>\code A.block(i,j,m,n) \endcode</td>
+</tr>
+<tr>
+  <td>%Block starting at \c i0,j0 and ending at \c i1,j1</td>
+  <td>\code A(seq(i0,i1), seq(j0,j1) \endcode</td>
+  <td>\code A.block(i0,j0,i1-i0+1,j1-j0+1) \endcode</td>
+</tr>
+<tr>
+  <td>Even columns of A</td>
+  <td>\code A(all, seq(0,last,2)) \endcode</td>
+  <td></td>
+</tr>
+<tr>
+  <td>First \c n odd rows A</td>
+  <td>\code A(seqN(1,n,2), all) \endcode</td>
+  <td></td>
+</tr>
+<tr>
+  <td>The last past one column</td>
+  <td>\code A(all, last-1) \endcode</td>
+  <td>\code A.col(A.cols()-2) \endcode</td>
+</tr>
+<tr>
+  <td>The middle row</td>
+  <td>\code A(last/2,all) \endcode</td>
+  <td>\code A.row((A.rows()-1)/2) \endcode</td>
+</tr>
+<tr>
+  <td>Last elements of v starting at i</td>
+  <td>\code v(seq(i,last)) \endcode</td>
+  <td>\code v.tail(v.size()-i) \endcode</td>
+</tr>
+<tr>
+  <td>Last \c n elements of v</td>
+  <td>\code v(seq(last+1-n,last)) \endcode</td>
+  <td>\code v.tail(n) \endcode</td>
+</tr>
+</table>
+
+As seen in the last exemple, referencing the <i> last n </i> elements (or rows/columns) is a bit cumbersome to write.
+This becomes even more tricky and error prone with a non-default increment.
+Here comes \link Eigen::lastN(SizeType) Eigen::lastN(size) \endlink, and \link Eigen::lastN(SizeType,IncrType) Eigen::lastN(size,incr) \endlink:
+
+<table class="manual">
+<tr>
+  <th>Intent</th>
+  <th>Code</th>
+  <th>Block-API equivalence</th>
+</tr>
+<tr>
+  <td>Last \c n elements of v</td>
+  <td>\code v(lastN(n)) \endcode</td>
+  <td>\code v.tail(n) \endcode</td>
+</tr>
+<tr>
+  <td>Bottom-right corner of A of size \c m times \c n</td>
+  <td>\code v(lastN(m), lastN(n)) \endcode</td>
+  <td>\code A.bottomRightCorner(m,n) \endcode</td>
+</tr>
+<tr>
+  <td>Bottom-right corner of A of size \c m times \c n</td>
+  <td>\code v(lastN(m), lastN(n)) \endcode</td>
+  <td>\code A.bottomRightCorner(m,n) \endcode</td>
+</tr>
+<tr>
+  <td>Last \c n columns taking 1 column over 3</td>
+  <td>\code A(all, lastN(n,3)) \endcode</td>
+  <td></td>
+</tr>
+</table>
+
+\section TutorialSlicingFixed Compile time size and increment
+
+In terms of performance, %Eigen and the compiler can take advantage of compile-time size and increment.
+To this end, you can enforce compile-time parameters using Eigen::fix<val>.
+Such compile-time value can be combined with the Eigen::last symbol:
+\code v(seq(last-fix<7>, last-fix<2>))
+\endcode
+In this example %Eigen knowns at compile-time that the returned expression has 6 elements.
+It is equivalent to:
+\code v(seqN(last-7, fix<6>))
+\endcode
+
+We can revisit the <i>even columns of A</i> example as follows:
+\code A(all, seq(0,last,fix<2>))
+\endcode
+
+
+\section TutorialSlicingReverse Reverse order
+
+Row/column indices can also be enumerated in decreasing order using a negative increment.
+For instance, one over two columns of A from the column 20 to 10:
+\code A(all, seq(20, 10, fix<-2>))
+\endcode
+The last \c n rows starting from the last one:
+\code A(seqN(last, n, fix<-1>), all)
+\endcode
+You can also use the ArithmeticSequence::reverse() method to reverse its order.
+The previous example can thus also be written as:
+\code A(lastN(n).reverse(), all)
+\endcode
+
+
+\section TutorialSlicingArray Array of indices
+
+The generic `operator()` can also takes as input an arbitrary list of row or column indices stored as either an `ArrayXi`, a `std::vector<int>`, `std::array<int,N>`, etc.
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Slicing_stdvector_cxx11.cpp
+</td>
+<td>
+\verbinclude Slicing_stdvector_cxx11.out
+</td></tr></table>
+
+You can also directly pass a static array:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Slicing_rawarray_cxx11.cpp
+</td>
+<td>
+\verbinclude Slicing_rawarray_cxx11.out
+</td></tr></table>
+
+or expressions:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Slicing_arrayexpr.cpp
+</td>
+<td>
+\verbinclude Slicing_arrayexpr.out
+</td></tr></table>
+
+When passing an object with a compile-time size such as `Array4i`, `std::array<int,N>`, or a static array, then the returned expression also exhibit compile-time dimensions.
+
+\section TutorialSlicingCustomArray Custom index list
+
+More generally, `operator()` can accept as inputs any object \c ind of type \c T compatible with:
+\code
+Index s = ind.size(); or Index s = size(ind);
+Index i;
+i = ind[i];
+\endcode
+
+This means you can easily build your own fancy sequence generator and pass it to `operator()`.
+Here is an exemple enlarging a given matrix while padding the additional first rows and columns through repetition:
+
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Slicing_custom_padding_cxx11.cpp
+</td>
+<td>
+\verbinclude Slicing_custom_padding_cxx11.out
+</td></tr></table>
+
+<br>
+
+*/
+
+/*
+TODO add:
+so_repeat_inner.cpp
+so_repeleme.cpp
+*/
+}
diff --git a/contrib/eigen/doc/TutorialSparse.dox b/contrib/eigen/doc/TutorialSparse.dox
index 3529074088846d79318226c95dd2d111304c59cc..c69171ec5d52999c8b8bf9ab259ba7de0e064146 100644
--- a/contrib/eigen/doc/TutorialSparse.dox
+++ b/contrib/eigen/doc/TutorialSparse.dox
@@ -57,10 +57,10 @@ The \c "_" indicates available free space to quickly insert new elements.
 Assuming no reallocation is needed, the insertion of a random element is therefore in O(nnz_j) where nnz_j is the number of nonzeros of the respective inner vector.
 On the other hand, inserting elements with increasing inner indices in a given inner vector is much more efficient since this only requires to increase the respective \c InnerNNZs entry that is a O(1) operation.
 
-The case where no empty space is available is a special case, and is refered as the \em compressed mode.
+The case where no empty space is available is a special case, and is referred as the \em compressed mode.
 It corresponds to the widely used Compressed Column (or Row) Storage schemes (CCS or CRS).
 Any SparseMatrix can be turned to this form by calling the SparseMatrix::makeCompressed() function.
-In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j].
+In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we have the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j].
 Therefore, in practice a call to SparseMatrix::makeCompressed() frees this buffer.
 
 It is worth noting that most of our wrappers to external libraries requires compressed matrices as inputs.
@@ -212,7 +212,7 @@ See the SparseMatrix::setFromTriplets() function and class Triplet for more deta
 
 
 In some cases, however, slightly higher performance, and lower memory consumption can be reached by directly inserting the non-zeros into the destination matrix.
-A typical scenario of this approach is illustrated bellow:
+A typical scenario of this approach is illustrated below:
 \code
 1: SparseMatrix<double> mat(rows,cols);         // default is column major
 2: mat.reserve(VectorXi::Constant(cols,6));
diff --git a/contrib/eigen/doc/UnalignedArrayAssert.dox b/contrib/eigen/doc/UnalignedArrayAssert.dox
index 95d95a2d51074bb7bcbec0bd39d18460697b07c2..410c8a58f86bd5dac079c9d8c12955ec65d211f3 100644
--- a/contrib/eigen/doc/UnalignedArrayAssert.dox
+++ b/contrib/eigen/doc/UnalignedArrayAssert.dox
@@ -12,7 +12,9 @@ is explained here: http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArr
 **** READ THIS WEB PAGE !!! ****"' failed.
 </pre>
 
-There are 4 known causes for this issue. Please read on to understand them and learn how to fix them.
+There are 4 known causes for this issue.
+If you can target \cpp17 only with a recent compiler (e.g., GCC>=7, clang>=5, MSVC>=19.12), then you're lucky: enabling c++17 should be enough (if not, please <a href="http://eigen.tuxfamily.org/bz/">report</a> to us).
+Otherwise, please read on to understand those issues and learn how to fix them.
 
 \eigenAutoToc
 
@@ -35,7 +37,7 @@ If you have code like this,
 class Foo
 {
   //...
-  Eigen::Vector2d v;
+  Eigen::Vector4d v;
   //...
 };
 //...
@@ -44,27 +46,27 @@ Foo *foo = new Foo;
 
 then you need to read this separate page: \ref TopicStructHavingEigenMembers "Structures Having Eigen Members".
 
-Note that here, Eigen::Vector2d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types".
+Note that here, Eigen::Vector4d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types".
 
 \section c2 Cause 2: STL Containers or manual memory allocation
 
 If you use STL Containers such as std::vector, std::map, ..., with %Eigen objects, or with classes containing %Eigen objects, like this,
 
 \code
-std::vector<Eigen::Matrix2f> my_vector;
-struct my_class { ... Eigen::Matrix2f m; ... };
+std::vector<Eigen::Matrix2d> my_vector;
+struct my_class { ... Eigen::Matrix2d m; ... };
 std::map<int, my_class> my_map;
 \endcode
 
 then you need to read this separate page: \ref TopicStlContainers "Using STL Containers with Eigen".
 
-Note that here, Eigen::Matrix2f is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member".
+Note that here, Eigen::Matrix2d is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types" and \ref TopicStructHavingEigenMembers "structures having such Eigen objects as member".
 
-The same issue will be exhibited by any classes/functions by-passing operator new to allocate memory, that is, by performing custom memory allocation followed by calls to the placement new operator. This is for instance typically the case of \c std::make_shared or \c std::allocate_shared for which is the solution is to use an \ref aligned_allocator "aligned allocator" as detailed in the \ref TopicStlContainers "solution for STL containers".
+The same issue will be exhibited by any classes/functions by-passing operator new to allocate memory, that is, by performing custom memory allocation followed by calls to the placement new operator. This is for instance typically the case of \c `std::make_shared` or `std::allocate_shared` for which is the solution is to use an \ref aligned_allocator "aligned allocator" as detailed in the \ref TopicStlContainers "solution for STL containers".
 
 \section c3 Cause 3: Passing Eigen objects by value
 
-If some function in your code is getting an Eigen object passed by value, like this,
+If some function in your code is getting an %Eigen object passed by value, like this,
 
 \code
 void func(Eigen::Vector4d v);
@@ -90,11 +92,13 @@ then you need to read this separate page: \ref TopicWrongStackAlignment "Compile
 
 Note that here, Eigen::Quaternionf is only used as an example, more generally the issue arises for all \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types".
 
+
 \section explanation General explanation of this assertion
 
-\ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" must absolutely be created at 16-byte-aligned locations, otherwise SIMD instructions addressing them will crash.
+\ref TopicFixedSizeVectorizable "Fixed-size vectorizable Eigen objects" must absolutely be created at properly aligned locations, otherwise SIMD instructions addressing them will crash.
+For instance, SSE/NEON/MSA/Altivec/VSX targets will require 16-byte-alignment, whereas AVX and AVX512 targets may require up to 32 and 64 byte alignment respectively.
 
-Eigen normally takes care of these alignment issues for you, by setting an alignment attribute on them and by overloading their "operator new".
+%Eigen normally takes care of these alignment issues for you, by setting an alignment attribute on them and by overloading their `operator new`.
 
 However there are a few corner cases where these alignment settings get overridden: they are the possible causes for this assertion.
 
@@ -102,18 +106,27 @@ However there are a few corner cases where these alignment settings get overridd
 
 Three possibilities:
 <ul>
-  <li>Use the \c DontAlign option to Matrix, Array, Quaternion, etc. objects that gives you trouble. This way Eigen won't try to align them, and thus won"t assume any special alignment. On the down side, you will pay the cost of unaligned loads/stores for them, but on modern CPUs, the overhead is either null or marginal. See \link StructHavingEigenMembers_othersolutions here \endlink for an example.</li>
-  <li>Define \link TopicPreprocessorDirectivesPerformance EIGEN_DONT_ALIGN_STATICALLY \endlink. That disables all 16-byte (and above) static alignment code, while keeping 16-byte (or above) heap alignment. This has the effect of
+  <li>Use the \c DontAlign option to Matrix, Array, Quaternion, etc. objects that gives you trouble. This way %Eigen won't try to over-align them, and thus won"t assume any special alignment. On the down side, you will pay the cost of unaligned loads/stores for them, but on modern CPUs, the overhead is either null or marginal. See \link StructHavingEigenMembers_othersolutions here \endlink for an example.</li>
+  <li>Define \link TopicPreprocessorDirectivesPerformance EIGEN_MAX_STATIC_ALIGN_BYTES \endlink to 0. That disables all 16-byte (and above) static alignment code, while keeping 16-byte (or above) heap alignment. This has the effect of
       vectorizing fixed-size objects (like Matrix4d) through unaligned stores (as controlled by \link TopicPreprocessorDirectivesPerformance EIGEN_UNALIGNED_VECTORIZE \endlink), while keeping unchanged the vectorization of dynamic-size objects
-      (like MatrixXd). But do note that this breaks ABI compatibility with the default behavior of static alignment.</li>
-  <li>Or define both \link TopicPreprocessorDirectivesPerformance  EIGEN_DONT_VECTORIZE \endlink and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the
-      16-byte alignment code and thus preserves ABI compatibility, but completely disables vectorization.</li>
+      (like MatrixXd). On 64 bytes systems, you might also define it 16 to disable only 32 and 64 bytes of over-alignment. But do note that this breaks ABI compatibility with the default behavior of static alignment.</li>
+  <li>Or define both \link TopicPreprocessorDirectivesPerformance  EIGEN_DONT_VECTORIZE \endlink and `EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT`. This keeps the
+      16-byte (or above) alignment code and thus preserves ABI compatibility, but completely disables vectorization.</li>
 </ul>
 
-If you want to know why defining EIGEN_DONT_VECTORIZE does not by itself disable 16-byte alignment and the assertion, here's the explanation:
+If you want to know why defining `EIGEN_DONT_VECTORIZE` does not by itself disable 16-byte (or above) alignment and the assertion, here's the explanation:
 
 It doesn't disable the assertion, because otherwise code that runs fine without vectorization would suddenly crash when enabling vectorization.
-It doesn't disable 16-byte alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.
+It doesn't disable 16-byte (or above) alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.
+
+\section checkmycode How can I check my code is safe regarding alignment issues?
+
+Unfortunately, there is no possibility in c++ to detect any of the aforementioned shortcoming at compile time (though static analyzers are becoming more and more powerful and could detect some of them).
+Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the beginning of this page.
+Therefore, if your program runs fine on a given system with some given compilation flags, then this does not guarantee that your code is safe. For instance, on most 64 bits systems buffer are aligned on 16 bytes boundary and so, if you do not enable AVX instruction set, then your code will run fine. On the other hand, the same code may assert if moving to a more exotic platform, or enabling AVX instructions that required 32 bytes alignment by default.
+
+The situation is not hopeless though. Assuming your code is well covered by unit test, then you can check its alignment safety by linking it to a custom malloc library returning 8 bytes aligned buffers only. This way all alignment shortcomings should pop-up. To this end, you must also compile your program with \link TopicPreprocessorDirectivesPerformance EIGEN_MALLOC_ALREADY_ALIGNED=0 \endlink.
+
 
 */
 
diff --git a/contrib/eigen/doc/UsingIntelMKL.dox b/contrib/eigen/doc/UsingIntelMKL.dox
index 6de14afad9bb09db5d2fd3e37414f3a82ec171d8..fc35c3cf0237d1ef700eb7a82ad491be869e12eb 100644
--- a/contrib/eigen/doc/UsingIntelMKL.dox
+++ b/contrib/eigen/doc/UsingIntelMKL.dox
@@ -63,7 +63,11 @@ In addition you can choose which parts will be substituted by defining one or mu
 <tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
 </table>
 
-The options can be combined with \b MKL_DIRECT_CALL to enable MKL direct call feature. This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices. To make it work properly, the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined.
+The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE* macros can be combined with \c EIGEN_USE_MKL to explicitly tell Eigen that the underlying BLAS/Lapack implementation is Intel MKL.
+The main effect is to enable MKL direct call feature (\c MKL_DIRECT_CALL).
+This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices.
+MKL direct call can be disabled by defining \c EIGEN_MKL_NO_DIRECT_CALL.
+
 
 Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
 
diff --git a/contrib/eigen/doc/UsingNVCC.dox b/contrib/eigen/doc/UsingNVCC.dox
index f8e755b79d68b09426f5222cfafda6e4fd44f780..36beb2ddda68abdf1cc5980ed843be09f402a3d5 100644
--- a/contrib/eigen/doc/UsingNVCC.dox
+++ b/contrib/eigen/doc/UsingNVCC.dox
@@ -3,18 +3,16 @@ namespace Eigen {
 
 /** \page TopicCUDA Using Eigen in CUDA kernels
 
-\b Disclaimer: this page is about an \b experimental feature in %Eigen.
-
-Staring from CUDA 5.0, the CUDA compiler, \c nvcc, is able to properly parse %Eigen's code (almost).
-A few adaptations of the %Eigen's code already allows to use some parts of %Eigen in your own CUDA kernels.
-To this end you need the devel branch of %Eigen, CUDA 5.0 or greater with GCC.
+Staring from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays for fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc most Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords making them callable from both host and device code.
+This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen's header.
+This might be useful to disable some warnings when a .cu file makes use of Eigen on the host side only.
+However, in both cases, host's SIMD vectorization has to be disabled in .cu files.
+It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files.
 
 Known issues:
 
  - \c nvcc with MS Visual Studio does not work (patch welcome)
  
- - \c nvcc with \c clang does not work (patch welcome)
- 
  - \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file. To workaround this, you can add the following before including any other files:
    \code
     // workaround issue between gcc >= 4.7 and cuda 5.5
diff --git a/contrib/eigen/doc/eigendoxy.css b/contrib/eigen/doc/eigendoxy.css
index 225f5d32ebc4fcebefc70dc0daa2bc1628fd927d..4e9d7d1207098be1c284233f538cb1aa3a3fb36d 100644
--- a/contrib/eigen/doc/eigendoxy.css
+++ b/contrib/eigen/doc/eigendoxy.css
@@ -93,7 +93,7 @@ table th.inter {
 	border-color: #cccccc;
 }
 
-/** class for exemple / output tables **/
+/** class for example / output tables **/
 
 table.example {
 }
@@ -183,6 +183,18 @@ span.cpp11,span.cpp14,span.cpp17 {
   font-weight: bold;
 }
 
+.newin3x {
+  color: #a37c1a;
+  font-weight: bold;
+}
+
+div.warningbox {
+  max-width:60em;
+  border-style: solid solid solid solid;
+  border-color: red;
+  border-width: 3px;
+}
+
 /**** old Eigen's styles ****/
 
 
@@ -220,4 +232,4 @@ td.width20em p.endtd {
 /* needed for huge screens */
 .ui-resizable-e {
   background-repeat: repeat-y;
-}
\ No newline at end of file
+}
diff --git a/contrib/eigen/doc/examples/CMakeLists.txt b/contrib/eigen/doc/examples/CMakeLists.txt
index f7a19055fce2693d956bd3c5b843bf285f12307c..a2c9d05a43c9031b6c8bfea6a24f6d88cf338681 100644
--- a/contrib/eigen/doc/examples/CMakeLists.txt
+++ b/contrib/eigen/doc/examples/CMakeLists.txt
@@ -13,9 +13,8 @@ foreach(example_src ${examples_SRCS})
     ARGS >${CMAKE_CURRENT_BINARY_DIR}/${example}.out
   )
   add_dependencies(all_examples ${example})
-endforeach(example_src)
+endforeach()
 
-check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
 if(EIGEN_COMPILER_SUPPORT_CPP11)
 ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
 endif()
\ No newline at end of file
diff --git a/contrib/eigen/doc/examples/Tutorial_BlockOperations_block_assignment.cpp b/contrib/eigen/doc/examples/Tutorial_BlockOperations_block_assignment.cpp
index 76f49f2fbc63aec2735c9e65db755d7564ae1a28..0b87313a1cf83776df0ce492100358c8f9a7ce9e 100644
--- a/contrib/eigen/doc/examples/Tutorial_BlockOperations_block_assignment.cpp
+++ b/contrib/eigen/doc/examples/Tutorial_BlockOperations_block_assignment.cpp
@@ -14,5 +14,5 @@ int main()
   a.block<2,2>(1,1) = m;
   cout << "Here is now a with m copied into its central 2x2 block:" << endl << a << endl << endl;
   a.block(0,0,2,3) = a.block(2,1,2,3);
-  cout << "Here is now a with bottom-right 2x3 block copied into top-left 2x2 block:" << endl << a << endl << endl;
+  cout << "Here is now a with bottom-right 2x3 block copied into top-left 2x3 block:" << endl << a << endl << endl;
 }
diff --git a/contrib/eigen/doc/examples/class_FixedReshaped.cpp b/contrib/eigen/doc/examples/class_FixedReshaped.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6d4085dea39cc0d291ddc4670bf034d2c48ccae
--- /dev/null
+++ b/contrib/eigen/doc/examples/class_FixedReshaped.cpp
@@ -0,0 +1,22 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace Eigen;
+using namespace std;
+
+template<typename Derived>
+Eigen::Reshaped<Derived, 4, 2>
+reshape_helper(MatrixBase<Derived>& m)
+{
+  return Eigen::Reshaped<Derived, 4, 2>(m.derived());
+}
+
+int main(int, char**)
+{
+  MatrixXd m(2, 4);
+  m << 1, 2, 3, 4,
+       5, 6, 7, 8;
+  MatrixXd n = reshape_helper(m);
+  cout << "matrix m is:" << endl << m << endl;
+  cout << "matrix n is:" << endl << n << endl;
+  return 0;
+}
diff --git a/contrib/eigen/doc/examples/class_Reshaped.cpp b/contrib/eigen/doc/examples/class_Reshaped.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..18fb45454de20474dce470f50d83454b2e85c157
--- /dev/null
+++ b/contrib/eigen/doc/examples/class_Reshaped.cpp
@@ -0,0 +1,23 @@
+#include <Eigen/Core>
+#include <iostream>
+using namespace std;
+using namespace Eigen;
+
+template<typename Derived>
+const Reshaped<const Derived>
+reshape_helper(const MatrixBase<Derived>& m, int rows, int cols)
+{
+  return Reshaped<const Derived>(m.derived(), rows, cols);
+}
+
+int main(int, char**)
+{
+  MatrixXd m(3, 4);
+  m << 1, 4, 7, 10,
+       2, 5, 8, 11,
+       3, 6, 9, 12;
+  cout << m << endl;
+  Ref<const MatrixXd> n = reshape_helper(m, 2, 6);
+  cout << "Matrix m is:" << endl << m << endl;
+  cout << "Matrix n is:" << endl << n << endl;
+}
diff --git a/contrib/eigen/doc/examples/nullary_indexing.cpp b/contrib/eigen/doc/examples/nullary_indexing.cpp
index e27c3585ab795c3c07b4713acc1c13e20f03445f..b74db5fd14a6b3071bbb300597ea8901f1c4f77d 100644
--- a/contrib/eigen/doc/examples/nullary_indexing.cpp
+++ b/contrib/eigen/doc/examples/nullary_indexing.cpp
@@ -30,7 +30,7 @@ public:
 // [function]
 template <class ArgType, class RowIndexType, class ColIndexType>
 CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
-indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+mat_indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
 {
   typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
   typedef typename Func::MatrixType MatrixType;
@@ -45,7 +45,7 @@ int main()
   Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
   Array3i ri(1,2,1);
   ArrayXi ci(6); ci << 3,2,1,0,0,2;
-  Eigen::MatrixXi B = indexing(A, ri, ci);
+  Eigen::MatrixXi B = mat_indexing(A, ri, ci);
   std::cout << "A =" << std::endl;
   std::cout << A << std::endl << std::endl;
   std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
@@ -53,11 +53,11 @@ int main()
   std::cout << "[main1]\n";
 
   std::cout << "[main2]\n";
-  B =  indexing(A, ri+1, ci);
+  B =  mat_indexing(A, ri+1, ci);
   std::cout << "A(ri+1,ci) =" << std::endl;
   std::cout << B << std::endl << std::endl;
-#if __cplusplus >= 201103L
-  B =  indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
+#if EIGEN_COMP_CXXVER >= 11
+  B =  mat_indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
   std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
   std::cout << B << std::endl << std::endl;
 #endif
diff --git a/contrib/eigen/doc/snippets/Array_initializer_list_23_cxx11.cpp b/contrib/eigen/doc/snippets/Array_initializer_list_23_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c2166eab1f0fecfa5aab0d3eec6b3beda5d5326
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Array_initializer_list_23_cxx11.cpp
@@ -0,0 +1,5 @@
+ArrayXXi a {
+  {1, 2, 3},
+  {3, 4, 5}
+};
+cout << a << endl;
diff --git a/contrib/eigen/doc/snippets/Array_initializer_list_vector_cxx11.cpp b/contrib/eigen/doc/snippets/Array_initializer_list_vector_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a668d84accbd8747fa4fdada7adeac5aa7b712da
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Array_initializer_list_vector_cxx11.cpp
@@ -0,0 +1,2 @@
+Array<int, Dynamic, 1> v {{1, 2, 3, 4, 5}};
+cout << v << endl;
diff --git a/contrib/eigen/doc/snippets/Array_variadic_ctor_cxx11.cpp b/contrib/eigen/doc/snippets/Array_variadic_ctor_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0e4ec4469f9ae08fe8a44d41ec8ab0091d9d5655
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Array_variadic_ctor_cxx11.cpp
@@ -0,0 +1,3 @@
+Array<int, 1, 6> a(1, 2, 3, 4, 5, 6);
+Array<int, 3, 1> b {1, 2, 3};
+cout << a << "\n\n" << b << endl;
diff --git a/contrib/eigen/doc/snippets/BiCGSTAB_simple.cpp b/contrib/eigen/doc/snippets/BiCGSTAB_simple.cpp
index 5520f4f1f058ed13dc6b13413e1c66103134e67b..8c8829fd3a0125f2b2941428ed0e1f8de481cab9 100644
--- a/contrib/eigen/doc/snippets/BiCGSTAB_simple.cpp
+++ b/contrib/eigen/doc/snippets/BiCGSTAB_simple.cpp
@@ -8,4 +8,4 @@
   std::cout << "#iterations:     " << solver.iterations() << std::endl;
   std::cout << "estimated error: " << solver.error()      << std::endl;
   /* ... update b ... */
-  x = solver.solve(b); // solve again
\ No newline at end of file
+  x = solver.solve(b); // solve again
diff --git a/contrib/eigen/doc/snippets/BiCGSTAB_step_by_step.cpp b/contrib/eigen/doc/snippets/BiCGSTAB_step_by_step.cpp
index 06147bb81e7eaacb4a5601a65a735e1c20388310..6c95d5a9caa1cc377a40b7e477b6da4f3dcdd50b 100644
--- a/contrib/eigen/doc/snippets/BiCGSTAB_step_by_step.cpp
+++ b/contrib/eigen/doc/snippets/BiCGSTAB_step_by_step.cpp
@@ -11,4 +11,4 @@
     x = solver.solveWithGuess(b,x);
     std::cout << i << " : " << solver.error() << std::endl;
     ++i;
-  } while (solver.info()!=Success && i<100);
\ No newline at end of file
+  } while (solver.info()!=Success && i<100);
diff --git a/contrib/eigen/doc/snippets/CMakeLists.txt b/contrib/eigen/doc/snippets/CMakeLists.txt
index 1baf32fbac0274410133f90070f21368991c7910..65f195a31ab13d80f91cedf70795dc16c8fd9cc2 100644
--- a/contrib/eigen/doc/snippets/CMakeLists.txt
+++ b/contrib/eigen/doc/snippets/CMakeLists.txt
@@ -6,21 +6,31 @@ foreach(snippet_src ${snippets_SRCS})
   get_filename_component(snippet ${snippet_src} NAME_WE)
   set(compile_snippet_target compile_${snippet})
   set(compile_snippet_src ${compile_snippet_target}.cpp)
-  file(READ ${snippet_src} snippet_source_code)
-  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/compile_snippet.cpp.in
-                 ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
-  add_executable(${compile_snippet_target}
-                 ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
-  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
-    target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  if((NOT ${snippet_src} MATCHES "cxx11") OR EIGEN_COMPILER_SUPPORT_CPP11)
+    file(READ ${snippet_src} snippet_source_code)
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/compile_snippet.cpp.in
+                  ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
+    add_executable(${compile_snippet_target}
+                  ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
+    if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+      target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+    endif()
+    if(${snippet_src} MATCHES "cxx11")
+      set_target_properties(${compile_snippet_target} PROPERTIES COMPILE_FLAGS "-std=c++11")
+    endif()
+    if(${snippet_src} MATCHES "deprecated")
+      set_target_properties(${compile_snippet_target} PROPERTIES COMPILE_FLAGS "-DEIGEN_NO_DEPRECATED_WARNING")
+    endif()
+    add_custom_command(
+      TARGET ${compile_snippet_target}
+      POST_BUILD
+      COMMAND ${compile_snippet_target}
+      ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out
+    )
+    add_dependencies(all_snippets ${compile_snippet_target})
+    set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
+                                PROPERTIES OBJECT_DEPENDS ${snippet_src})
+  else()
+    message("skip snippet ${snippet_src} because compiler does not support C++11")
   endif()
-  add_custom_command(
-    TARGET ${compile_snippet_target}
-    POST_BUILD
-    COMMAND ${compile_snippet_target}
-    ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out
-  )
-  add_dependencies(all_snippets ${compile_snippet_target})
-  set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
-                              PROPERTIES OBJECT_DEPENDS ${snippet_src})
-endforeach(snippet_src)
+endforeach()
diff --git a/contrib/eigen/doc/snippets/ComplexEigenSolver_eigenvectors.cpp b/contrib/eigen/doc/snippets/ComplexEigenSolver_eigenvectors.cpp
index bb1c2ccf14b48b8fce1c584abf71281c1107104d..adeed9af64e43535ae708f7b1c7fb3dbd8655008 100644
--- a/contrib/eigen/doc/snippets/ComplexEigenSolver_eigenvectors.cpp
+++ b/contrib/eigen/doc/snippets/ComplexEigenSolver_eigenvectors.cpp
@@ -1,4 +1,4 @@
 MatrixXcf ones = MatrixXcf::Ones(3,3);
 ComplexEigenSolver<MatrixXcf> ces(ones);
 cout << "The first eigenvector of the 3x3 matrix of ones is:" 
-     << endl << ces.eigenvectors().col(1) << endl;
+     << endl << ces.eigenvectors().col(0) << endl;
diff --git a/contrib/eigen/doc/snippets/Cwise_rint.cpp b/contrib/eigen/doc/snippets/Cwise_rint.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1dc7b2fd143f72245801834dd96141aee50b4fbf
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Cwise_rint.cpp
@@ -0,0 +1,3 @@
+ArrayXd v = ArrayXd::LinSpaced(7,-2,2);
+cout << v << endl << endl;
+cout << rint(v) << endl;
diff --git a/contrib/eigen/doc/snippets/DenseBase_LinSpaced_seq.cpp b/contrib/eigen/doc/snippets/DenseBase_LinSpaced_seq_deprecated.cpp
similarity index 100%
rename from contrib/eigen/doc/snippets/DenseBase_LinSpaced_seq.cpp
rename to contrib/eigen/doc/snippets/DenseBase_LinSpaced_seq_deprecated.cpp
diff --git a/contrib/eigen/doc/snippets/DirectionWise_hnormalized.cpp b/contrib/eigen/doc/snippets/DirectionWise_hnormalized.cpp
index 3410790a87782c053d795fd389462dd4a669a3f5..2451f6e7bbbe754f4ce1e155a8a6157f06441562 100644
--- a/contrib/eigen/doc/snippets/DirectionWise_hnormalized.cpp
+++ b/contrib/eigen/doc/snippets/DirectionWise_hnormalized.cpp
@@ -1,7 +1,6 @@
-typedef Matrix<double,4,Dynamic> Matrix4Xd;
 Matrix4Xd M = Matrix4Xd::Random(4,5);
 Projective3d P(Matrix4d::Random());
 cout << "The matrix M is:" << endl << M << endl << endl;
 cout << "M.colwise().hnormalized():" << endl << M.colwise().hnormalized() << endl << endl;
 cout << "P*M:" << endl << P*M << endl << endl;
-cout << "(P*M).colwise().hnormalized():" << endl << (P*M).colwise().hnormalized() << endl << endl;
\ No newline at end of file
+cout << "(P*M).colwise().hnormalized():" << endl << (P*M).colwise().hnormalized() << endl << endl;
diff --git a/contrib/eigen/doc/snippets/Jacobi_makeGivens.cpp b/contrib/eigen/doc/snippets/Jacobi_makeGivens.cpp
index 4b733c3064a6fbcbd2d6ec9cd63d20db1f04b880..6f8ec054aed538cc2939ce74fd246370fa1c0111 100644
--- a/contrib/eigen/doc/snippets/Jacobi_makeGivens.cpp
+++ b/contrib/eigen/doc/snippets/Jacobi_makeGivens.cpp
@@ -3,4 +3,4 @@ JacobiRotation<float> G;
 G.makeGivens(v.x(), v.y());
 cout << "Here is the vector v:" << endl << v << endl;
 v.applyOnTheLeft(0, 1, G.adjoint());
-cout << "Here is the vector J' * v:" << endl << v << endl;
\ No newline at end of file
+cout << "Here is the vector J' * v:" << endl << v << endl;
diff --git a/contrib/eigen/doc/snippets/Jacobi_makeJacobi.cpp b/contrib/eigen/doc/snippets/Jacobi_makeJacobi.cpp
index 0cc331d9f93fe0766fecd334142b9ce034923e6b..a86e80a6297305c8256c968876be579d52990eb3 100644
--- a/contrib/eigen/doc/snippets/Jacobi_makeJacobi.cpp
+++ b/contrib/eigen/doc/snippets/Jacobi_makeJacobi.cpp
@@ -5,4 +5,4 @@ J.makeJacobi(m, 0, 1);
 cout << "Here is the matrix m:" << endl << m << endl;
 m.applyOnTheLeft(0, 1, J.adjoint());
 m.applyOnTheRight(0, 1, J);
-cout << "Here is the matrix J' * m * J:" << endl << m << endl;
\ No newline at end of file
+cout << "Here is the matrix J' * m * J:" << endl << m << endl;
diff --git a/contrib/eigen/doc/snippets/Map_placement_new.cpp b/contrib/eigen/doc/snippets/Map_placement_new.cpp
index 2e40eca32f353445d52975b6d3cb94b4575e1207..83b83a893238facb599a8ce4147f0300849f855b 100644
--- a/contrib/eigen/doc/snippets/Map_placement_new.cpp
+++ b/contrib/eigen/doc/snippets/Map_placement_new.cpp
@@ -2,4 +2,4 @@ int data[] = {1,2,3,4,5,6,7,8,9};
 Map<RowVectorXi> v(data,4);
 cout << "The mapped vector v is: " << v << "\n";
 new (&v) Map<RowVectorXi>(data+4,5);
-cout << "Now v is: " << v << "\n";
\ No newline at end of file
+cout << "Now v is: " << v << "\n";
diff --git a/contrib/eigen/doc/snippets/MatrixBase_colwise_iterator_cxx11.cpp b/contrib/eigen/doc/snippets/MatrixBase_colwise_iterator_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..116063fb15f7d4556f3d3e556de3474388c761f2
--- /dev/null
+++ b/contrib/eigen/doc/snippets/MatrixBase_colwise_iterator_cxx11.cpp
@@ -0,0 +1,12 @@
+Matrix3i m = Matrix3i::Random();
+cout << "Here is the initial matrix m:" << endl << m << endl;
+int i = -1;
+for(auto c: m.colwise()) {
+  c *= i;
+  ++i;
+}
+cout << "Here is the matrix m after the for-range-loop:" << endl << m << endl;
+auto cols = m.colwise();
+auto it = std::find_if(cols.cbegin(), cols.cend(),
+                       [](Matrix3i::ConstColXpr x) { return x.squaredNorm() == 0; });
+cout << "The first empty column is: " << distance(cols.cbegin(),it) << endl;
diff --git a/contrib/eigen/doc/snippets/MatrixBase_cwiseArg.cpp b/contrib/eigen/doc/snippets/MatrixBase_cwiseArg.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e0857cf97e11c7b2f2cc56a76005538ada587fc0
--- /dev/null
+++ b/contrib/eigen/doc/snippets/MatrixBase_cwiseArg.cpp
@@ -0,0 +1,3 @@
+MatrixXcf v = MatrixXcf::Random(2, 3);
+cout << v << endl << endl;
+cout << v.cwiseArg() << endl;
\ No newline at end of file
diff --git a/contrib/eigen/doc/snippets/MatrixBase_hnormalized.cpp b/contrib/eigen/doc/snippets/MatrixBase_hnormalized.cpp
index 652cd77c09e05ee7c09ccdd33cfac46bae1e73f9..b714adcc3591d6c8f15c1d8482fb6c4bc0d98c21 100644
--- a/contrib/eigen/doc/snippets/MatrixBase_hnormalized.cpp
+++ b/contrib/eigen/doc/snippets/MatrixBase_hnormalized.cpp
@@ -3,4 +3,4 @@ Projective3d P(Matrix4d::Random());
 cout << "v                   = " << v.transpose() << "]^T" << endl;
 cout << "v.hnormalized()     = " << v.hnormalized().transpose() << "]^T" << endl;
 cout << "P*v                 = " << (P*v).transpose() << "]^T" << endl;
-cout << "(P*v).hnormalized() = " << (P*v).hnormalized().transpose() << "]^T" << endl;
\ No newline at end of file
+cout << "(P*v).hnormalized() = " << (P*v).hnormalized().transpose() << "]^T" << endl;
diff --git a/contrib/eigen/doc/snippets/MatrixBase_homogeneous.cpp b/contrib/eigen/doc/snippets/MatrixBase_homogeneous.cpp
index 457c28f91a5f279e5ec09b91ae275ff689921a18..26319609769547d6846b1eeb8b7aace5471bd079 100644
--- a/contrib/eigen/doc/snippets/MatrixBase_homogeneous.cpp
+++ b/contrib/eigen/doc/snippets/MatrixBase_homogeneous.cpp
@@ -3,4 +3,4 @@ Projective3d P(Matrix4d::Random());
 cout << "v                                   = [" << v.transpose() << "]^T" << endl;
 cout << "h.homogeneous()                     = [" << v.homogeneous().transpose() << "]^T" << endl;
 cout << "(P * v.homogeneous())               = [" << (P * v.homogeneous()).transpose() << "]^T" << endl;
-cout << "(P * v.homogeneous()).hnormalized() = [" << (P * v.homogeneous()).eval().hnormalized().transpose() << "]^T" << endl;
\ No newline at end of file
+cout << "(P * v.homogeneous()).hnormalized() = [" << (P * v.homogeneous()).eval().hnormalized().transpose() << "]^T" << endl;
diff --git a/contrib/eigen/doc/snippets/MatrixBase_reshaped_auto.cpp b/contrib/eigen/doc/snippets/MatrixBase_reshaped_auto.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..59f9d3f60c762c18600ac47e07cbd508bb5d0a54
--- /dev/null
+++ b/contrib/eigen/doc/snippets/MatrixBase_reshaped_auto.cpp
@@ -0,0 +1,4 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(2, AutoSize):" << endl << m.reshaped(2, AutoSize) << endl;
+cout << "Here is m.reshaped<RowMajor>(AutoSize, fix<8>):" << endl << m.reshaped<RowMajor>(AutoSize, fix<8>) << endl;
diff --git a/contrib/eigen/doc/snippets/MatrixBase_reshaped_fixed.cpp b/contrib/eigen/doc/snippets/MatrixBase_reshaped_fixed.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e9e2cfb675984d3b0142c30917aa130d588ed42
--- /dev/null
+++ b/contrib/eigen/doc/snippets/MatrixBase_reshaped_fixed.cpp
@@ -0,0 +1,3 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(fix<2>,fix<8>):" << endl << m.reshaped(fix<2>,fix<8>) << endl;
diff --git a/contrib/eigen/doc/snippets/MatrixBase_reshaped_int_int.cpp b/contrib/eigen/doc/snippets/MatrixBase_reshaped_int_int.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..af4ca592f907053664b534c36d585b16da5cc861
--- /dev/null
+++ b/contrib/eigen/doc/snippets/MatrixBase_reshaped_int_int.cpp
@@ -0,0 +1,3 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(2, 8):" << endl << m.reshaped(2, 8) << endl;
diff --git a/contrib/eigen/doc/snippets/MatrixBase_reshaped_to_vector.cpp b/contrib/eigen/doc/snippets/MatrixBase_reshaped_to_vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..37f65f7c6049ba34ad1589c8d8492b207ab2bf56
--- /dev/null
+++ b/contrib/eigen/doc/snippets/MatrixBase_reshaped_to_vector.cpp
@@ -0,0 +1,4 @@
+Matrix4i m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped().transpose():" << endl << m.reshaped().transpose() << endl;
+cout << "Here is m.reshaped<RowMajor>().transpose():  " << endl << m.reshaped<RowMajor>().transpose() << endl;
diff --git a/contrib/eigen/doc/snippets/Matrix_initializer_list_23_cxx11.cpp b/contrib/eigen/doc/snippets/Matrix_initializer_list_23_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..60280ab58e4b8109b292efda240fafee072cbef0
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Matrix_initializer_list_23_cxx11.cpp
@@ -0,0 +1,5 @@
+MatrixXd m {
+  {1, 2, 3},
+  {4, 5, 6}
+};
+cout << m << endl;
diff --git a/contrib/eigen/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp b/contrib/eigen/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..325257cb0c57b6f30720eb28d5f40eb0155ab4f9
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Matrix_initializer_list_vector_cxx11.cpp
@@ -0,0 +1,2 @@
+VectorXi v {{1, 2}};
+cout << v << endl;
diff --git a/contrib/eigen/doc/snippets/Matrix_variadic_ctor_cxx11.cpp b/contrib/eigen/doc/snippets/Matrix_variadic_ctor_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..06d33f57130048dd867bf29e319b0ed205c95c57
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Matrix_variadic_ctor_cxx11.cpp
@@ -0,0 +1,3 @@
+Matrix<int, 1, 6> a(1, 2, 3, 4, 5, 6);
+Matrix<int, 3, 1> b {1, 2, 3};
+cout << a << "\n\n" << b << endl;
diff --git a/contrib/eigen/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp b/contrib/eigen/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp
index cfc8b0d54b73ea7984211b7e7238ddc575a4e840..94b0d6ebd3c08f1414ae1ae7b5adf87d0da7d0d4 100644
--- a/contrib/eigen/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp
+++ b/contrib/eigen/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp
@@ -1,4 +1,4 @@
 MatrixXd ones = MatrixXd::Ones(3,3);
 SelfAdjointEigenSolver<MatrixXd> es(ones);
 cout << "The first eigenvector of the 3x3 matrix of ones is:" 
-     << endl << es.eigenvectors().col(1) << endl;
+     << endl << es.eigenvectors().col(0) << endl;
diff --git a/contrib/eigen/doc/snippets/Slicing_arrayexpr.cpp b/contrib/eigen/doc/snippets/Slicing_arrayexpr.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2df8180981455072097c156cf46b2d5c42dba8a1
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Slicing_arrayexpr.cpp
@@ -0,0 +1,4 @@
+ArrayXi ind(5); ind<<4,2,5,5,3;
+MatrixXi A = MatrixXi::Random(4,6);
+cout << "Initial matrix A:\n" << A << "\n\n";
+cout << "A(all,ind-1):\n" << A(all,ind-1) << "\n\n";
diff --git a/contrib/eigen/doc/snippets/Slicing_custom_padding_cxx11.cpp b/contrib/eigen/doc/snippets/Slicing_custom_padding_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..24db98b7db76c28e4a835a403d62c644eabce1fb
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Slicing_custom_padding_cxx11.cpp
@@ -0,0 +1,12 @@
+struct pad {
+  Index size() const { return out_size; }
+  Index operator[] (Index i) const { return std::max<Index>(0,i-(out_size-in_size)); }
+  Index in_size, out_size;
+};
+
+Matrix3i A;
+A.reshaped() = VectorXi::LinSpaced(9,1,9);
+cout << "Initial matrix A:\n" << A << "\n\n";
+MatrixXi B(5,5);
+B = A(pad{3,5}, pad{3,5});
+cout << "A(pad{3,N}, pad{3,N}):\n" << B << "\n\n";
diff --git a/contrib/eigen/doc/snippets/Slicing_rawarray_cxx11.cpp b/contrib/eigen/doc/snippets/Slicing_rawarray_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1087131ab02366110b269402f18e8d64a662bed2
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Slicing_rawarray_cxx11.cpp
@@ -0,0 +1,5 @@
+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+MatrixXi A = MatrixXi::Random(4,6);
+cout << "Initial matrix A:\n" << A << "\n\n";
+cout << "A(all,{4,2,5,5,3}):\n" << A(all,{4,2,5,5,3}) << "\n\n";
+#endif
diff --git a/contrib/eigen/doc/snippets/Slicing_stdvector_cxx11.cpp b/contrib/eigen/doc/snippets/Slicing_stdvector_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..555f6625f2306f738074c9a23eaba68b3542b4a2
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Slicing_stdvector_cxx11.cpp
@@ -0,0 +1,4 @@
+std::vector<int> ind{4,2,5,5,3};
+MatrixXi A = MatrixXi::Random(4,6);
+cout << "Initial matrix A:\n" << A << "\n\n";
+cout << "A(all,ind):\n" << A(all,ind) << "\n\n";
diff --git a/contrib/eigen/doc/snippets/TopicAliasing_mult4.cpp b/contrib/eigen/doc/snippets/TopicAliasing_mult4.cpp
index 8a8992f6ca58572090f5cd3ee2a67f62ab572daa..01c1c6d77c5e4d75ab5f63756737367c7c271196 100644
--- a/contrib/eigen/doc/snippets/TopicAliasing_mult4.cpp
+++ b/contrib/eigen/doc/snippets/TopicAliasing_mult4.cpp
@@ -2,4 +2,4 @@ MatrixXf A(2,2), B(3,2);
 B << 2, 0,  0, 3, 1, 1;
 A << 2, 0, 0, -2;
 A = (B * A).cwiseAbs();
-cout << A;
\ No newline at end of file
+cout << A;
diff --git a/contrib/eigen/doc/snippets/Tridiagonalization_decomposeInPlace.cpp b/contrib/eigen/doc/snippets/Tridiagonalization_decomposeInPlace.cpp
index 93dcfca1d6de1bf0e06419c7575a94d7c329fbfa..3cdce679b2e96b6507f597f5e5575a735bec9e19 100644
--- a/contrib/eigen/doc/snippets/Tridiagonalization_decomposeInPlace.cpp
+++ b/contrib/eigen/doc/snippets/Tridiagonalization_decomposeInPlace.cpp
@@ -4,7 +4,8 @@ cout << "Here is a random symmetric 5x5 matrix:" << endl << A << endl << endl;
 
 VectorXd diag(5);
 VectorXd subdiag(4);
-internal::tridiagonalization_inplace(A, diag, subdiag, true);
+VectorXd hcoeffs(4);  // Scratch space for householder reflector.
+internal::tridiagonalization_inplace(A, diag, subdiag, hcoeffs, true);
 cout << "The orthogonal matrix Q is:" << endl << A << endl;
 cout << "The diagonal of the tridiagonal matrix T is:" << endl << diag << endl;
 cout << "The subdiagonal of the tridiagonal matrix T is:" << endl << subdiag << endl;
diff --git a/contrib/eigen/doc/snippets/Tutorial_ReshapeMat2Mat.cpp b/contrib/eigen/doc/snippets/Tutorial_ReshapeMat2Mat.cpp
index f84d6e76d0e8975221672d2cd0d0484af3e195ac..737afecb8078fc998c5b3a16146e5e49d927b23b 100644
--- a/contrib/eigen/doc/snippets/Tutorial_ReshapeMat2Mat.cpp
+++ b/contrib/eigen/doc/snippets/Tutorial_ReshapeMat2Mat.cpp
@@ -3,4 +3,4 @@ M1 << 1, 2, 3,  4,  5,  6,
       7, 8, 9, 10, 11, 12;
 
 Map<MatrixXf> M2(M1.data(), 6,2);
-cout << "M2:" << endl << M2 << endl;
\ No newline at end of file
+cout << "M2:" << endl << M2 << endl;
diff --git a/contrib/eigen/doc/snippets/Tutorial_ReshapeMat2Vec.cpp b/contrib/eigen/doc/snippets/Tutorial_ReshapeMat2Vec.cpp
index 95bd4e0e6b00b7118a5c71b62453080adf1de987..32980a790a2eef57d1344f1fbfa13bc942f684db 100644
--- a/contrib/eigen/doc/snippets/Tutorial_ReshapeMat2Vec.cpp
+++ b/contrib/eigen/doc/snippets/Tutorial_ReshapeMat2Vec.cpp
@@ -8,4 +8,4 @@ cout << "v1:" << endl << v1 << endl;
 
 Matrix<float,Dynamic,Dynamic,RowMajor> M2(M1);
 Map<RowVectorXf> v2(M2.data(), M2.size());
-cout << "v2:" << endl << v2 << endl;
\ No newline at end of file
+cout << "v2:" << endl << v2 << endl;
diff --git a/contrib/eigen/doc/snippets/Tutorial_SlicingCol.cpp b/contrib/eigen/doc/snippets/Tutorial_SlicingCol.cpp
index f667ff68942f245d444a1288e2a33bdc43e2be45..695d130140e14fe0e837e88903a5bf2bdd4b45dc 100644
--- a/contrib/eigen/doc/snippets/Tutorial_SlicingCol.cpp
+++ b/contrib/eigen/doc/snippets/Tutorial_SlicingCol.cpp
@@ -8,4 +8,4 @@ RowMajorMatrixXf M3(M1);
 cout << "Row major input:" << endl << M3 << "\n";
 Map<RowMajorMatrixXf,0,Stride<Dynamic,3> > M4(M3.data(), M3.rows(), (M3.cols()+2)/3,
                                               Stride<Dynamic,3>(M3.outerStride(),3));
-cout << "1 column over 3:" << endl << M4 << "\n";
\ No newline at end of file
+cout << "1 column over 3:" << endl << M4 << "\n";
diff --git a/contrib/eigen/doc/snippets/Tutorial_SlicingVec.cpp b/contrib/eigen/doc/snippets/Tutorial_SlicingVec.cpp
index 07e10bf6958c047a65c4e9c67e028718fc78c7dc..9b822464d6a5b0ea45b6f80798c6d09126f21f71 100644
--- a/contrib/eigen/doc/snippets/Tutorial_SlicingVec.cpp
+++ b/contrib/eigen/doc/snippets/Tutorial_SlicingVec.cpp
@@ -1,4 +1,4 @@
 RowVectorXf v = RowVectorXf::LinSpaced(20,0,19);
 cout << "Input:" << endl << v << endl;
 Map<RowVectorXf,0,InnerStride<2> > v2(v.data(), v.size()/2);
-cout << "Even:" << v2 << endl;
\ No newline at end of file
+cout << "Even:" << v2 << endl;
diff --git a/contrib/eigen/doc/snippets/Tutorial_range_for_loop_1d_cxx11.cpp b/contrib/eigen/doc/snippets/Tutorial_range_for_loop_1d_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e72e715d849fa16df6e83bc93aa68a8f712d6c63
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Tutorial_range_for_loop_1d_cxx11.cpp
@@ -0,0 +1,4 @@
+VectorXi v = VectorXi::Random(4);
+cout << "Here is the vector v:\n";
+for(auto x : v) cout << x << " ";
+cout << "\n";
diff --git a/contrib/eigen/doc/snippets/Tutorial_range_for_loop_2d_cxx11.cpp b/contrib/eigen/doc/snippets/Tutorial_range_for_loop_2d_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a12d26c71f69721808f6a5a8e86ceaa4f971f80
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Tutorial_range_for_loop_2d_cxx11.cpp
@@ -0,0 +1,5 @@
+Matrix2i A = Matrix2i::Random();
+cout << "Here are the coeffs of the 2x2 matrix A:\n";
+for(auto x : A.reshaped())
+  cout << x << " ";
+cout << "\n";
diff --git a/contrib/eigen/doc/snippets/Tutorial_reshaped_vs_resize_1.cpp b/contrib/eigen/doc/snippets/Tutorial_reshaped_vs_resize_1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e520e8e6b3871b3b6b900a6e772ea2e41a4d983f
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Tutorial_reshaped_vs_resize_1.cpp
@@ -0,0 +1,5 @@
+MatrixXi m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(2, 8):" << endl << m.reshaped(2, 8) << endl;
+m.resize(2,8);
+cout << "Here is the matrix m after m.resize(2,8):" << endl << m << endl;
diff --git a/contrib/eigen/doc/snippets/Tutorial_reshaped_vs_resize_2.cpp b/contrib/eigen/doc/snippets/Tutorial_reshaped_vs_resize_2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50dc454883041d61c9f660c4764fdd715f7386d4
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Tutorial_reshaped_vs_resize_2.cpp
@@ -0,0 +1,6 @@
+Matrix<int,Dynamic,Dynamic,RowMajor> m = Matrix4i::Random();
+cout << "Here is the matrix m:" << endl << m << endl;
+cout << "Here is m.reshaped(2, 8):" << endl << m.reshaped(2, 8) << endl;
+cout << "Here is m.reshaped<AutoOrder>(2, 8):" << endl << m.reshaped<AutoOrder>(2, 8) << endl;
+m.resize(2,8);
+cout << "Here is the matrix m after m.resize(2,8):" << endl << m << endl;
diff --git a/contrib/eigen/doc/snippets/Tutorial_std_sort.cpp b/contrib/eigen/doc/snippets/Tutorial_std_sort.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cde2a6f1b4c142dcf71cfe6840190329e27b428e
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Tutorial_std_sort.cpp
@@ -0,0 +1,4 @@
+Array4i v = Array4i::Random().abs();
+cout << "Here is the initial vector v:\n" << v.transpose() << "\n";
+std::sort(v.begin(), v.end());
+cout << "Here is the sorted vector v:\n" << v.transpose() << "\n";
diff --git a/contrib/eigen/doc/snippets/Tutorial_std_sort_rows_cxx11.cpp b/contrib/eigen/doc/snippets/Tutorial_std_sort_rows_cxx11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..03641603d86cbd95b4e1b6be0f6de4ba3d9ab842
--- /dev/null
+++ b/contrib/eigen/doc/snippets/Tutorial_std_sort_rows_cxx11.cpp
@@ -0,0 +1,5 @@
+ArrayXXi A = ArrayXXi::Random(4,4).abs();
+cout << "Here is the initial matrix A:\n" << A << "\n";
+for(auto row : A.rowwise())
+  std::sort(row.begin(), row.end());
+cout << "Here is the sorted matrix A:\n" << A << "\n";
diff --git a/contrib/eigen/doc/snippets/VectorwiseOp_homogeneous.cpp b/contrib/eigen/doc/snippets/VectorwiseOp_homogeneous.cpp
index aba4fed0eb2aa2e6743951aaa9858fbadebaf463..67cf5737d12de68b29baa3d196fd5c73e57934a5 100644
--- a/contrib/eigen/doc/snippets/VectorwiseOp_homogeneous.cpp
+++ b/contrib/eigen/doc/snippets/VectorwiseOp_homogeneous.cpp
@@ -1,7 +1,6 @@
-typedef Matrix<double,3,Dynamic> Matrix3Xd;
 Matrix3Xd M = Matrix3Xd::Random(3,5);
 Projective3d P(Matrix4d::Random());
 cout << "The matrix M is:" << endl << M << endl << endl;
 cout << "M.colwise().homogeneous():" << endl << M.colwise().homogeneous() << endl << endl;
 cout << "P * M.colwise().homogeneous():" << endl << P * M.colwise().homogeneous() << endl << endl;
-cout << "P * M.colwise().homogeneous().hnormalized(): " << endl << (P * M.colwise().homogeneous()).colwise().hnormalized() << endl << endl;
\ No newline at end of file
+cout << "P * M.colwise().homogeneous().hnormalized(): " << endl << (P * M.colwise().homogeneous()).colwise().hnormalized() << endl << endl;
diff --git a/contrib/eigen/doc/snippets/compile_snippet.cpp.in b/contrib/eigen/doc/snippets/compile_snippet.cpp.in
index d63f371a301b7aaea7d8dec14b2d1b9aeb4aa3c4..c11457a3f576df2555fa5d45fbae5a08c29962a5 100644
--- a/contrib/eigen/doc/snippets/compile_snippet.cpp.in
+++ b/contrib/eigen/doc/snippets/compile_snippet.cpp.in
@@ -15,6 +15,9 @@ using namespace std;
 int main(int, char**)
 {
   cout.precision(3);
-  ${snippet_source_code}
+// intentionally remove indentation of snippet
+{
+${snippet_source_code}
+}
   return 0;
 }
diff --git a/contrib/eigen/doc/snippets/tut_arithmetic_transpose_aliasing.cpp b/contrib/eigen/doc/snippets/tut_arithmetic_transpose_aliasing.cpp
index c8e4746d07d8c70c02aec51f779efc29d2f4fe04..f82e6f2ac98170299112c7185282df2937991c78 100644
--- a/contrib/eigen/doc/snippets/tut_arithmetic_transpose_aliasing.cpp
+++ b/contrib/eigen/doc/snippets/tut_arithmetic_transpose_aliasing.cpp
@@ -2,4 +2,4 @@ Matrix2i a; a << 1, 2, 3, 4;
 cout << "Here is the matrix a:\n" << a << endl;
 
 a = a.transpose(); // !!! do NOT do this !!!
-cout << "and the result of the aliasing effect:\n" << a << endl;
\ No newline at end of file
+cout << "and the result of the aliasing effect:\n" << a << endl;
diff --git a/contrib/eigen/doc/snippets/tut_arithmetic_transpose_inplace.cpp b/contrib/eigen/doc/snippets/tut_arithmetic_transpose_inplace.cpp
index 7a069ff233a13ae5891cc6361c54dd04b7c47394..5c81c9e02df83f78784c96cec5bb0fb436db99be 100644
--- a/contrib/eigen/doc/snippets/tut_arithmetic_transpose_inplace.cpp
+++ b/contrib/eigen/doc/snippets/tut_arithmetic_transpose_inplace.cpp
@@ -3,4 +3,4 @@ cout << "Here is the initial matrix a:\n" << a << endl;
 
 
 a.transposeInPlace();
-cout << "and after being transposed:\n" << a << endl;
\ No newline at end of file
+cout << "and after being transposed:\n" << a << endl;
diff --git a/contrib/eigen/doc/special_examples/CMakeLists.txt b/contrib/eigen/doc/special_examples/CMakeLists.txt
index 66ba4deeeb1f2f4989506c3903b1fc867ba93ee6..5b00e8b1a2107e738435e5d6340ca3e28a281249 100644
--- a/contrib/eigen/doc/special_examples/CMakeLists.txt
+++ b/contrib/eigen/doc/special_examples/CMakeLists.txt
@@ -3,7 +3,7 @@ if(NOT EIGEN_TEST_NOQT)
   if(QT4_FOUND)
     include(${QT_USE_FILE})
   endif()
-endif(NOT EIGEN_TEST_NOQT)
+endif()
 
 if(QT4_FOUND)
   add_executable(Tutorial_sparse_example Tutorial_sparse_example.cpp Tutorial_sparse_example_details.cpp)
@@ -17,7 +17,7 @@ if(QT4_FOUND)
   )
 
   add_dependencies(all_examples Tutorial_sparse_example)
-endif(QT4_FOUND)
+endif()
 
 if(EIGEN_COMPILER_SUPPORT_CPP11)
   add_executable(random_cpp11 random_cpp11.cpp)
diff --git a/contrib/eigen/doc/special_examples/Tutorial_sparse_example.cpp b/contrib/eigen/doc/special_examples/Tutorial_sparse_example.cpp
index c5767a8d34994e7ec82918751cfc6ecdc891d5ef..8850db052f63d564c80e424334e6017e1f5e9fd9 100644
--- a/contrib/eigen/doc/special_examples/Tutorial_sparse_example.cpp
+++ b/contrib/eigen/doc/special_examples/Tutorial_sparse_example.cpp
@@ -16,7 +16,7 @@ int main(int argc, char** argv)
   }
   
   int n = 300;  // size of the image
-  int m = n*n;  // number of unknows (=number of pixels)
+  int m = n*n;  // number of unknowns (=number of pixels)
 
   // Assembly:
   std::vector<T> coefficients;            // list of non-zeros coefficients
diff --git a/contrib/eigen/failtest/CMakeLists.txt b/contrib/eigen/failtest/CMakeLists.txt
index 1a73f05e629067a275b3c14d41bb7f4d654fad3c..256e541e270947a240e1b6e95154f4a3325d5560 100644
--- a/contrib/eigen/failtest/CMakeLists.txt
+++ b/contrib/eigen/failtest/CMakeLists.txt
@@ -1,4 +1,3 @@
-message(STATUS "Running the failtests")
 
 ei_add_failtest("failtest_sanity_check")
 
@@ -64,12 +63,8 @@ ei_add_failtest("bdcsvd_int")
 ei_add_failtest("eigensolver_int")
 ei_add_failtest("eigensolver_cplx")
 
-if (EIGEN_FAILTEST_FAILURE_COUNT)
-  message(FATAL_ERROR
-          "${EIGEN_FAILTEST_FAILURE_COUNT} out of ${EIGEN_FAILTEST_COUNT} failtests FAILED. "
-          "To debug these failures, manually compile these programs in ${CMAKE_CURRENT_SOURCE_DIR}, "
-          "with and without #define EIGEN_SHOULD_FAIL_TO_BUILD.")
-else()
-  message(STATUS "Failtest SUCCESS: all ${EIGEN_FAILTEST_COUNT} failtests passed.")
-  message(STATUS "")
+if(EIGEN_TEST_CXX11)
+  ei_add_failtest("initializer_list_1")
+  ei_add_failtest("initializer_list_2")
 endif()
+
diff --git a/contrib/eigen/failtest/initializer_list_1.cpp b/contrib/eigen/failtest/initializer_list_1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..92dfd1f658aed123c3a6a1125d4f444bc8e270cf
--- /dev/null
+++ b/contrib/eigen/failtest/initializer_list_1.cpp
@@ -0,0 +1,14 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define ROWS Dynamic
+#else
+#define ROWS 3
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix<int, ROWS, 1> {1, 2, 3};
+}
diff --git a/contrib/eigen/failtest/initializer_list_2.cpp b/contrib/eigen/failtest/initializer_list_2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1996050a781b53bfce99685da2ccbc5481766ad3
--- /dev/null
+++ b/contrib/eigen/failtest/initializer_list_2.cpp
@@ -0,0 +1,16 @@
+#include "../Eigen/Core"
+
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+#define ROWS Dynamic
+#define COLS Dynamic
+#else
+#define ROWS 3
+#define COLS 1
+#endif
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix<int, ROWS, COLS> {1, 2, 3};
+}
diff --git a/contrib/eigen/failtest/swap_2.cpp b/contrib/eigen/failtest/swap_2.cpp
index c130ba6e4e2a5f2e59dbb8163d58091f30ed7497..b386cf419ff5762479f2a7a9a34d116952d654fd 100644
--- a/contrib/eigen/failtest/swap_2.cpp
+++ b/contrib/eigen/failtest/swap_2.cpp
@@ -11,4 +11,4 @@ int main()
 #else
   b.swap(ac.const_cast_derived());
 #endif
-}
\ No newline at end of file
+}
diff --git a/contrib/eigen/lapack/CMakeLists.txt b/contrib/eigen/lapack/CMakeLists.txt
index fbecd66249810affb27d4f3ca45ba486ea7eade1..e48497fda0b7bb23e52c72424ba74629ea40bd56 100644
--- a/contrib/eigen/lapack/CMakeLists.txt
+++ b/contrib/eigen/lapack/CMakeLists.txt
@@ -1,15 +1,13 @@
 
 project(EigenLapack CXX)
 
-include("../cmake/language_support.cmake")
-
-workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS)
-
-if(EIGEN_Fortran_COMPILER_WORKS)
-  enable_language(Fortran OPTIONAL)
-  if(NOT CMAKE_Fortran_COMPILER)
-    set(EIGEN_Fortran_COMPILER_WORKS OFF)
-  endif()
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
 endif()
 
 add_custom_target(lapack)
@@ -35,7 +33,7 @@ set(EigenLapack_SRCS  ${EigenLapack_SRCS}
   second_NONE.f dsecnd_NONE.f
 )
 
-option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enbale the Lapack unit tests")
+option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests")
 
 if(EIGEN_ENABLE_LAPACK_TESTS)
 
@@ -59,7 +57,7 @@ if(EIGEN_ENABLE_LAPACK_TESTS)
       message(STATUS "Setup lapack reference and lapack unit tests")
       execute_process(COMMAND tar xzf  "lapack_addons_3.4.1.tgz" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
     else()
-      message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests wont be enabled")
+      message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests won't be enabled")
       set(EIGEN_ENABLE_LAPACK_TESTS false)
     endif()
                   
@@ -74,7 +72,7 @@ if(EIGEN_ENABLE_LAPACK_TESTS)
         sgetrf.f  dgetrf.f  cgetrf.f  zgetrf.f
         sgetrs.f  dgetrs.f  cgetrs.f  zgetrs.f)
     
-    FILE(GLOB ReferenceLapack_SRCS0 RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "reference/*.f")
+    file(GLOB ReferenceLapack_SRCS0 RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "reference/*.f")
     foreach(filename1 IN LISTS ReferenceLapack_SRCS0)
       string(REPLACE "reference/" "" filename ${filename1})
       list(FIND EigenLapack_SRCS ${filename} id1)
@@ -86,29 +84,33 @@ if(EIGEN_ENABLE_LAPACK_TESTS)
   endif()
   
   
-endif(EIGEN_ENABLE_LAPACK_TESTS)
+endif()
 
-endif(EIGEN_Fortran_COMPILER_WORKS)
+endif()
 
-add_library(eigen_lapack_static ${EigenLapack_SRCS} ${ReferenceLapack_SRCS})
-add_library(eigen_lapack SHARED ${EigenLapack_SRCS})
+set(EIGEN_LAPACK_TARGETS "")
 
-target_link_libraries(eigen_lapack  eigen_blas)
+add_library(eigen_lapack_static ${EigenLapack_SRCS} ${ReferenceLapack_SRCS})
+list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack_static)
 
-if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
-  target_link_libraries(eigen_lapack_static ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
-  target_link_libraries(eigen_lapack        ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+if (EIGEN_BUILD_SHARED_LIBS)
+  add_library(eigen_lapack SHARED ${EigenLapack_SRCS})
+  list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack)
+  target_link_libraries(eigen_lapack  eigen_blas)
 endif()
 
-add_dependencies(lapack eigen_lapack eigen_lapack_static)
+foreach(target IN LISTS EIGEN_LAPACK_TARGETS)
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  add_dependencies(lapack ${target})
+  install(TARGETS ${target}
+          RUNTIME DESTINATION bin
+          LIBRARY DESTINATION lib
+          ARCHIVE DESTINATION lib)
+endforeach()
 
-install(TARGETS eigen_lapack eigen_lapack_static
-        RUNTIME DESTINATION bin
-        LIBRARY DESTINATION lib
-        ARCHIVE DESTINATION lib)
 
-        
-        
 get_filename_component(eigen_full_path_to_testing_lapack "./testing/" ABSOLUTE)
 if(EXISTS ${eigen_full_path_to_testing_lapack})
   
@@ -137,22 +139,21 @@ if(EXISTS ${eigen_full_path_to_testing_lapack})
   add_subdirectory(testing/MATGEN)
   add_subdirectory(testing/LIN)
   add_subdirectory(testing/EIG)
-  cmake_policy(SET CMP0026 OLD)
   macro(add_lapack_test output input target)
     set(TEST_INPUT "${LAPACK_SOURCE_DIR}/testing/${input}")
     set(TEST_OUTPUT "${LAPACK_BINARY_DIR}/TESTING/${output}")
-    get_target_property(TEST_LOC ${target} LOCATION)
     string(REPLACE "." "_" input_name ${input})
     set(testName "${target}_${input_name}")
     if(EXISTS "${TEST_INPUT}")
-      add_test(LAPACK-${testName} "${CMAKE_COMMAND}"
-        -DTEST=${TEST_LOC}
+      add_test(NAME LAPACK-${testName}
+        COMMAND "${CMAKE_COMMAND}"
+        -DTEST=$<TARGET_FILE:${target}>
         -DINPUT=${TEST_INPUT} 
         -DOUTPUT=${TEST_OUTPUT} 
         -DINTDIR=${CMAKE_CFG_INTDIR}
         -P "${LAPACK_SOURCE_DIR}/testing/runtest.cmake")
     endif()
-  endmacro(add_lapack_test)
+  endmacro()
 
   if (BUILD_SINGLE)
   add_lapack_test(stest.out stest.in xlintsts)
diff --git a/contrib/eigen/scripts/buildtests.in b/contrib/eigen/scripts/buildtests.in
index 526d5b74b9421b613d03f67603510fe33903caa4..ab9c18fb1d54c52ecb3f5ee61fffad8767721c83 100755
--- a/contrib/eigen/scripts/buildtests.in
+++ b/contrib/eigen/scripts/buildtests.in
@@ -10,7 +10,7 @@ then
 fi
 
 TESTSLIST="@EIGEN_TESTS_LIST@"
-targets_to_make=`echo "$TESTSLIST" | egrep "$1" | xargs echo`
+targets_to_make=$(echo "$TESTSLIST" | grep -E "$1" | xargs echo)
 
 if [ -n "${EIGEN_MAKE_ARGS:+x}" ]
 then
diff --git a/contrib/eigen/scripts/cdashtesting.cmake.in b/contrib/eigen/scripts/cdashtesting.cmake.in
index 59cf5332808396d987ee5b1e15d1e621e03bdab4..0bf0fac2a2e179cd6ce7b4df436ead091cfd0161 100644
--- a/contrib/eigen/scripts/cdashtesting.cmake.in
+++ b/contrib/eigen/scripts/cdashtesting.cmake.in
@@ -12,8 +12,8 @@ elseif(${CTEST_SCRIPT_ARG} MATCHES Continuous)
   set(MODEL Continuous)
 endif()
 
-find_program(CTEST_HG_COMMAND NAMES hg)
-set(CTEST_UPDATE_COMMAND "${CTEST_HG_COMMAND}")
+find_program(CTEST_GIT_COMMAND NAMES git)
+set(CTEST_UPDATE_COMMAND "${CTEST_GIT_COMMAND}")
 
 ctest_start(${MODEL} ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY})
 
diff --git a/contrib/eigen/scripts/eigen_gen_split_test_help.cmake b/contrib/eigen/scripts/eigen_gen_split_test_help.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e43f5aabec38103cf449219b26f0f0c07d4cb461
--- /dev/null
+++ b/contrib/eigen/scripts/eigen_gen_split_test_help.cmake
@@ -0,0 +1,11 @@
+#!cmake -P
+file(WRITE split_test_helper.h "")
+foreach(i RANGE 1 999)
+  file(APPEND split_test_helper.h
+    "#if defined(EIGEN_TEST_PART_${i}) || defined(EIGEN_TEST_PART_ALL)\n"
+    "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
+    "#else\n"
+    "#define CALL_SUBTEST_${i}(FUNC)\n"
+    "#endif\n\n"
+  )
+endforeach()
\ No newline at end of file
diff --git a/contrib/eigen/scripts/eigen_monitor_perf.sh b/contrib/eigen/scripts/eigen_monitor_perf.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8f3425dafcf077602d48d16023606df05938e7ca
--- /dev/null
+++ b/contrib/eigen/scripts/eigen_monitor_perf.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# This is a script example to automatically update and upload performance unit tests.
+# The following five variables must be adjusted to match your settings.
+
+USER='ggael'
+UPLOAD_DIR=perf_monitoring/ggaelmacbook26
+EIGEN_SOURCE_PATH=$HOME/Eigen/eigen
+export PREFIX="haswell-fma"
+export CXX_FLAGS="-mfma -w"
+
+####
+
+BENCH_PATH=$EIGEN_SOURCE_PATH/bench/perf_monitoring/$PREFIX
+PREVPATH=$(pwd)
+cd $EIGEN_SOURCE_PATH/bench/perf_monitoring && ./runall.sh "Haswell 2.6GHz, FMA, Apple's clang" "$@"
+cd $PREVPATH || exit 1
+
+ALLFILES="$BENCH_PATH/*.png $BENCH_PATH/*.html $BENCH_PATH/index.html $BENCH_PATH/s1.js $BENCH_PATH/s2.js"
+
+# (the '/' at the end of path is very important, see rsync documentation)
+rsync -az --no-p --delete $ALLFILES $USER@ssh.tuxfamily.org:eigen/eigen.tuxfamily.org-web/htdocs/$UPLOAD_DIR/ || { echo "upload failed"; exit 1; }
+
+# fix the perm
+ssh $USER@ssh.tuxfamily.org "chmod -R g+w /home/eigen/eigen.tuxfamily.org-web/htdocs/perf_monitoring" || { echo "perm failed"; exit 1; }
diff --git a/contrib/eigen/test/AnnoyingScalar.h b/contrib/eigen/test/AnnoyingScalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ace083c51e1977fa684588076a11943fc2fc016
--- /dev/null
+++ b/contrib/eigen/test/AnnoyingScalar.h
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_H
+#define EIGEN_TEST_ANNOYING_SCALAR_H
+
+#include <ostream>
+
+#if EIGEN_COMP_GNUC
+#pragma GCC diagnostic ignored "-Wshadow"
+#endif
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+struct my_exception
+{
+  my_exception() {}
+  ~my_exception() {}
+};
+#endif
+
+// An AnnoyingScalar is a pseudo scalar type that:
+// - can randomly through an exception in operator +
+// - randomly allocate on the heap or initialize a reference to itself making it non trivially copyable, nor movable, nor relocatable.
+
+class AnnoyingScalar
+{
+  public:
+    AnnoyingScalar()                { init(); *v = 0;  }
+    AnnoyingScalar(long double _v)  { init(); *v = _v; }
+    AnnoyingScalar(double _v)       { init(); *v = _v; }
+    AnnoyingScalar(float _v)        { init(); *v = _v; }
+    AnnoyingScalar(int _v)          { init(); *v = _v; }
+    AnnoyingScalar(long _v)         { init(); *v = _v; }
+    #if EIGEN_HAS_CXX11
+    AnnoyingScalar(long long _v)    { init(); *v = _v; }
+    #endif
+    AnnoyingScalar(const AnnoyingScalar& other) { init(); *v = *(other.v); }
+    ~AnnoyingScalar() {
+      if(v!=&data)
+        delete v;
+      instances--;
+    }
+
+    void init() {
+      if(internal::random<bool>())
+        v = new float;
+      else
+        v = &data;
+      instances++;
+    }
+
+    AnnoyingScalar operator+(const AnnoyingScalar& other) const
+    {
+      #ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+      countdown--;
+      if(countdown<=0 && !dont_throw)
+        throw my_exception();
+      #endif
+      return AnnoyingScalar(*v+*other.v);
+    }
+
+    AnnoyingScalar operator-() const
+    { return AnnoyingScalar(-*v); }
+    
+    AnnoyingScalar operator-(const AnnoyingScalar& other) const
+    { return AnnoyingScalar(*v-*other.v); }
+    
+    AnnoyingScalar operator*(const AnnoyingScalar& other) const
+    { return AnnoyingScalar((*v)*(*other.v)); }
+
+    AnnoyingScalar operator/(const AnnoyingScalar& other) const
+    { return AnnoyingScalar((*v)/(*other.v)); }
+
+    AnnoyingScalar& operator+=(const AnnoyingScalar& other) { *v += *other.v; return *this; }
+    AnnoyingScalar& operator-=(const AnnoyingScalar& other) { *v -= *other.v; return *this; }
+    AnnoyingScalar& operator*=(const AnnoyingScalar& other) { *v *= *other.v; return *this; }
+    AnnoyingScalar& operator/=(const AnnoyingScalar& other) { *v /= *other.v; return *this; }
+    AnnoyingScalar& operator= (const AnnoyingScalar& other) { *v  = *other.v; return *this; }
+
+    bool operator==(const AnnoyingScalar& other) const { return *v == *other.v; }
+    bool operator!=(const AnnoyingScalar& other) const { return *v != *other.v; }
+    bool operator<=(const AnnoyingScalar& other) const { return *v <= *other.v; }
+    bool operator< (const AnnoyingScalar& other) const { return *v <  *other.v; }
+    bool operator>=(const AnnoyingScalar& other) const { return *v >= *other.v; }
+    bool operator> (const AnnoyingScalar& other) const { return *v >  *other.v; }
+  
+    float* v;
+    float data;
+    static int instances;
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    static int countdown;
+    static bool dont_throw;
+#endif
+};
+
+AnnoyingScalar real(const AnnoyingScalar &x) { return x; }
+AnnoyingScalar imag(const AnnoyingScalar & ) { return 0; }
+AnnoyingScalar conj(const AnnoyingScalar &x) { return x; }
+AnnoyingScalar sqrt(const AnnoyingScalar &x) { return std::sqrt(*x.v); }
+AnnoyingScalar abs (const AnnoyingScalar &x) { return std::abs(*x.v); }
+AnnoyingScalar cos (const AnnoyingScalar &x) { return std::cos(*x.v); }
+AnnoyingScalar sin (const AnnoyingScalar &x) { return std::sin(*x.v); }
+AnnoyingScalar acos(const AnnoyingScalar &x) { return std::acos(*x.v); }
+AnnoyingScalar atan2(const AnnoyingScalar &y,const AnnoyingScalar &x) { return std::atan2(*y.v,*x.v); }
+
+std::ostream& operator<<(std::ostream& stream,const AnnoyingScalar& x) {
+  stream << (*(x.v));
+  return stream;
+}
+
+int AnnoyingScalar::instances = 0;
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+int AnnoyingScalar::countdown = 0;
+bool AnnoyingScalar::dont_throw = false;
+#endif
+
+namespace Eigen {
+template<>
+struct NumTraits<AnnoyingScalar> : NumTraits<float>
+{
+  enum {
+    RequireInitialization = 1,
+  };
+  typedef AnnoyingScalar Real;
+  typedef AnnoyingScalar Nested;
+  typedef AnnoyingScalar Literal;
+  typedef AnnoyingScalar NonInteger;
+};
+
+template<> inline AnnoyingScalar test_precision<AnnoyingScalar>() { return test_precision<float>(); }
+
+namespace numext {
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const AnnoyingScalar& x) {
+  return (numext::isfinite)(*x.v);
+}
+}
+
+namespace internal {
+  template<> EIGEN_STRONG_INLINE double cast(const AnnoyingScalar& x) { return double(*x.v); }
+  template<> EIGEN_STRONG_INLINE float  cast(const AnnoyingScalar& x) { return *x.v; }
+}
+}  // namespace Eigen
+
+AnnoyingScalar get_test_precision(const AnnoyingScalar&)
+{ return Eigen::test_precision<AnnoyingScalar>(); }
+
+AnnoyingScalar test_relative_error(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return test_relative_error(*a.v, *b.v); }
+
+inline bool test_isApprox(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return internal::isApprox(*a.v, *b.v, test_precision<float>()); }
+
+inline bool test_isMuchSmallerThan(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return test_isMuchSmallerThan(*a.v, *b.v); }
+
+#endif // EIGEN_TEST_ANNOYING_SCALAR_H
diff --git a/contrib/eigen/test/CMakeLists.txt b/contrib/eigen/test/CMakeLists.txt
index 0747aa6cb65a40a47fdee2685a2798f93c481ea0..5136f82aa6e73f6499b864181f971c0b458196af 100644
--- a/contrib/eigen/test/CMakeLists.txt
+++ b/contrib/eigen/test/CMakeLists.txt
@@ -1,35 +1,30 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)  
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
-  foreach(i RANGE 1 999)
-    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
-      "#ifdef EIGEN_TEST_PART_${i}\n"
-      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
-      "#else\n"
-      "#define CALL_SUBTEST_${i}(FUNC)\n"
-      "#endif\n\n"
-    )
-  endforeach()
+# The file split_test_helper.h was generated at first run,
+# it is now included in test/
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
 endif()
 
 # check if we have a Fortran compiler
-include("../cmake/language_support.cmake")
-
-workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS)
-
-if(EIGEN_Fortran_COMPILER_WORKS)
-  enable_language(Fortran OPTIONAL)
-  if(NOT CMAKE_Fortran_COMPILER)
-    set(EIGEN_Fortran_COMPILER_WORKS OFF)
-  endif()
-endif()
-
-if(NOT EIGEN_Fortran_COMPILER_WORKS)
+include(CheckLanguage)
+check_language(Fortran)
+if(CMAKE_Fortran_COMPILER)
+  enable_language(Fortran)
+  set(EIGEN_Fortran_COMPILER_WORKS ON)
+else()
+  set(EIGEN_Fortran_COMPILER_WORKS OFF)
   # search for a default Lapack library to complete Eigen's one
   find_package(LAPACK QUIET)
 endif()
 
+# TODO do the same for EXTERNAL_LAPACK
+option(EIGEN_TEST_EXTERNAL_BLAS "Use external BLAS library for testsuite" OFF)
+if(EIGEN_TEST_EXTERNAL_BLAS)
+  find_package(BLAS REQUIRED)
+  message(STATUS "BLAS_COMPILER_FLAGS: ${BLAS_COMPILER_FLAGS}")
+  add_definitions("-DEIGEN_USE_BLAS") # is adding  ${BLAS_COMPILER_FLAGS} necessary?
+  list(APPEND EXTERNAL_LIBS "${BLAS_LIBRARIES}")
+endif()
+
 # configure blas/lapack (use Eigen's ones)
 set(EIGEN_BLAS_LIBRARIES eigen_blas)
 set(EIGEN_LAPACK_LIBRARIES eigen_lapack)
@@ -39,37 +34,48 @@ if(EIGEN_TEST_MATRIX_DIR)
   if(NOT WIN32)
     message(STATUS "Test realworld sparse matrices: ${EIGEN_TEST_MATRIX_DIR}")
     add_definitions( -DTEST_REAL_CASES="${EIGEN_TEST_MATRIX_DIR}" )
-  else(NOT WIN32)
+  else()
     message(STATUS "REAL CASES CAN NOT BE CURRENTLY TESTED ON WIN32")
-  endif(NOT WIN32)
-endif(EIGEN_TEST_MATRIX_DIR)
+  endif()
+endif()
 
 set(SPARSE_LIBS " ")
 
-find_package(Cholmod)
+find_package(CHOLMOD)
 if(CHOLMOD_FOUND)
   add_definitions("-DEIGEN_CHOLMOD_SUPPORT")
   include_directories(${CHOLMOD_INCLUDES})
   set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
   set(CHOLMOD_ALL_LIBS  ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES})
-  ei_add_property(EIGEN_TESTED_BACKENDS "Cholmod, ")
+  ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ")
 else()
-  ei_add_property(EIGEN_MISSING_BACKENDS "Cholmod, ")
+  ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ")
 endif()
 
-find_package(Umfpack)
+find_package(UMFPACK)
 if(UMFPACK_FOUND)
   add_definitions("-DEIGEN_UMFPACK_SUPPORT")
   include_directories(${UMFPACK_INCLUDES})
   set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
   set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
-  ei_add_property(EIGEN_TESTED_BACKENDS "UmfPack, ")
+  ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ")
 else()
-  ei_add_property(EIGEN_MISSING_BACKENDS "UmfPack, ")
+  ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ")
+endif()
+
+find_package(KLU)
+if(KLU_FOUND)
+  add_definitions("-DEIGEN_KLU_SUPPORT")
+  include_directories(${KLU_INCLUDES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ")
 endif()
 
 find_package(SuperLU 4.0)
-if(SUPERLU_FOUND)
+if(SuperLU_FOUND)
   add_definitions("-DEIGEN_SUPERLU_SUPPORT")
   include_directories(${SUPERLU_INCLUDES})
   set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
@@ -80,7 +86,7 @@ else()
 endif()
 
 
-find_package(PASTIX QUIET COMPONENTS METIS SCOTCH)
+find_package(PASTIX QUIET COMPONENTS METIS SEQ)
 # check that the PASTIX found is a version without MPI
 find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS
   NAMES pastix_nompi.h
@@ -99,9 +105,9 @@ if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS)
   elseif(METIS_FOUND)
     include_directories(${METIS_INCLUDE_DIRS})
     set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${METIS_LIBRARIES})
-  else(SCOTCH_FOUND)
+  else()
     ei_add_property(EIGEN_MISSING_BACKENDS  "PaStiX, ")
-  endif(SCOTCH_FOUND)
+  endif()
   set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES})
   set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP})
   ei_add_property(EIGEN_TESTED_BACKENDS  "PaStiX, ")
@@ -137,11 +143,11 @@ if(NOT EIGEN_TEST_NOQT)
   else()
     ei_add_property(EIGEN_MISSING_BACKENDS  "Qt4 support, ")
   endif()
-endif(NOT EIGEN_TEST_NOQT)
+endif()
 
 if(TEST_LIB)
   add_definitions("-DEIGEN_EXTERN_INSTANTIATIONS=1")
-endif(TEST_LIB)
+endif()
 
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official")
 add_custom_target(BuildOfficial)
@@ -153,23 +159,27 @@ ei_add_test(sizeof)
 ei_add_test(dynalloc)
 ei_add_test(nomalloc)
 ei_add_test(first_aligned)
+ei_add_test(type_alias)
 ei_add_test(nullary)
 ei_add_test(mixingtypes)
+ei_add_test(io)
 ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
-ei_add_test(unalignedassert)
 ei_add_test(vectorization_logic)
 ei_add_test(basicstuff)
 ei_add_test(constructor)
 ei_add_test(linearstructure)
 ei_add_test(integer_types)
 ei_add_test(unalignedcount)
-if(NOT EIGEN_TEST_NO_EXCEPTIONS)
+if(NOT EIGEN_TEST_NO_EXCEPTIONS AND NOT EIGEN_TEST_OPENMP)
   ei_add_test(exceptions)
 endif()
 ei_add_test(redux)
 ei_add_test(visitor)
 ei_add_test(block)
 ei_add_test(corners)
+ei_add_test(symbolic_index)
+ei_add_test(indexed_view)
+ei_add_test(reshape)
 ei_add_test(swap)
 ei_add_test(resize)
 ei_add_test(conservative_resize)
@@ -185,7 +195,7 @@ ei_add_test(smallvectors)
 ei_add_test(mapped_matrix)
 ei_add_test(mapstride)
 ei_add_test(mapstaticmethods)
-ei_add_test(array)
+ei_add_test(array_cwise)
 ei_add_test(array_for_matrix)
 ei_add_test(array_replicate)
 ei_add_test(array_reverse)
@@ -254,6 +264,7 @@ ei_add_test(sparselu)
 ei_add_test(sparseqr)
 ei_add_test(umeyama)
 ei_add_test(nesting_ops "${CMAKE_CXX_FLAGS_DEBUG}")
+ei_add_test(nestbyvalue)
 ei_add_test(zerosized)
 ei_add_test(dontalign)
 ei_add_test(evaluators)
@@ -269,7 +280,15 @@ ei_add_test(ctorleak)
 ei_add_test(mpl2only)
 ei_add_test(inplace_decomposition)
 ei_add_test(half_float)
+ei_add_test(bfloat16_float)
 ei_add_test(array_of_string)
+ei_add_test(num_dimensions)
+ei_add_test(stl_iterators)
+ei_add_test(blasutil)
+if(EIGEN_TEST_CXX11)
+  ei_add_test(initializer_list_construction)
+  ei_add_test(diagonal_matrix_variadic_ctor)
+endif()
 
 add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
 
@@ -289,12 +308,16 @@ ei_add_test(fastmath " ${EIGEN_FASTMATH_FLAGS} ")
 
 if(QT4_FOUND)
   ei_add_test(qtvector "" "${QT_QTCORE_LIBRARY}")
-endif(QT4_FOUND)
+endif()
 
 if(UMFPACK_FOUND)
   ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}")
 endif()
 
+if(KLU_FOUND OR SuiteSparse_FOUND)
+  ei_add_test(klu_support "" "${KLU_ALL_LIBS}")
+endif()
+
 if(SUPERLU_FOUND)
   ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}")
 endif()
@@ -330,6 +353,9 @@ if(CMAKE_COMPILER_IS_GNUCXX AND NOT CXX_IS_QCC)
   ei_add_property(EIGEN_TESTING_SUMMARY "CXX_VERSION:       ${EIGEN_CXX_VERSION_STRING}\n")
 endif()
 ei_add_property(EIGEN_TESTING_SUMMARY "CXX_FLAGS:         ${CMAKE_CXX_FLAGS}\n")
+if (EIGEN_TEST_CUSTOM_CXX_FLAGS)
+  ei_add_property(EIGEN_TESTING_SUMMARY "Custom CXX flags:  ${EIGEN_TEST_CUSTOM_CXX_FLAGS}\n")
+endif()
 ei_add_property(EIGEN_TESTING_SUMMARY "Sparse lib flags:  ${SPARSE_LIBS}\n")
 
 option(EIGEN_TEST_EIGEN2 "Run whole Eigen2 test suite against EIGEN2_SUPPORT" OFF)
@@ -339,7 +365,7 @@ if(EIGEN_TEST_EIGEN2)
 endif()
 
 # boost MP unit test
-find_package(Boost)
+find_package(Boost 1.53.0)
 if(Boost_FOUND)
   include_directories(${Boost_INCLUDE_DIRS})
   ei_add_test(boostmultiprec "" "${Boost_LIBRARIES}")
@@ -363,28 +389,77 @@ find_package(CUDA 5.0)
 if(CUDA_FOUND)
   
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+  
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if (${CUDA_VERSION} STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+  
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 
     set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
-   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+    foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}")
+    endforeach()
+  else()
+    foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}")
+    endforeach()
   endif()
-  cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR})
+  string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
   
-  ei_add_test(cuda_basic)
+  ei_add_test(gpu_basic)
   
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 
-endif(CUDA_FOUND)
+endif()
+
+endif()
 
-endif(EIGEN_TEST_CUDA)
 
+# HIP unit tests
+option(EIGEN_TEST_HIP "Add HIP support." OFF)
+if (EIGEN_TEST_HIP)
 
-file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)    
-add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
+  set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
 
-option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF)
-IF(EIGEN_TEST_BUILD_DOCUMENTATION)
+  if (EXISTS ${HIP_PATH})
+    
+    list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) 
+
+    find_package(HIP REQUIRED)
+    if (HIP_FOUND)
+
+      execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+      if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+	include_directories(${HIP_PATH}/include)
+
+	set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
+	ei_add_test(gpu_basic)
+	unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+	
+      elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+	message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+      else ()
+	message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+      endif() 
+    endif()
+  else ()
+    message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+  endif()
+endif()
+
+cmake_dependent_option(EIGEN_TEST_BUILD_DOCUMENTATION "Test building the doxygen documentation" OFF "EIGEN_BUILD_DOC" OFF)
+if(EIGEN_TEST_BUILD_DOCUMENTATION)
   add_dependencies(buildtests doc)
-ENDIF()
+endif()
+
+# Register all smoke tests
+include("EigenSmokeTestList")
+ei_add_smoke_tests("${ei_smoke_test_list}")
diff --git a/contrib/eigen/test/MovableScalar.h b/contrib/eigen/test/MovableScalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a90d037a852f6981bf206dd6aaac5baa9e71287
--- /dev/null
+++ b/contrib/eigen/test/MovableScalar.h
@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Sebastien Boisvert <seb@boisvert.info>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MISC_MOVABLE_SCALAR_H
+#define EIGEN_MISC_MOVABLE_SCALAR_H
+
+#include <vector>
+
+namespace Eigen
+{
+template <typename Scalar, typename Base = std::vector<Scalar>>
+struct MovableScalar : public Base
+{
+  MovableScalar() = default;
+  ~MovableScalar() = default;
+  MovableScalar(const MovableScalar&) = default;
+  MovableScalar(MovableScalar&& other) = default;
+  MovableScalar& operator=(const MovableScalar&) = default;
+  MovableScalar& operator=(MovableScalar&& other) = default;
+  MovableScalar(Scalar scalar) : Base(100, scalar) {}
+
+  operator Scalar() const { return this->size() > 0 ? this->back() : Scalar(); }
+};
+
+template<> struct NumTraits<MovableScalar<float>> : GenericNumTraits<float> {};
+}
+
+#endif
+
diff --git a/contrib/eigen/test/SafeScalar.h b/contrib/eigen/test/SafeScalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5cb75717c7c0131814ee98ceb840ac83b48dd26
--- /dev/null
+++ b/contrib/eigen/test/SafeScalar.h
@@ -0,0 +1,30 @@
+
+// A Scalar that asserts for uninitialized access.
+template<typename T>
+class SafeScalar {
+ public:
+  SafeScalar() : initialized_(false) {}
+  SafeScalar(const SafeScalar& other) {
+    *this = other;
+  }
+  SafeScalar& operator=(const SafeScalar& other) {
+    val_ = T(other);
+    initialized_ = true;
+    return *this;
+  }
+  
+  SafeScalar(T val) : val_(val), initialized_(true) {}
+  SafeScalar& operator=(T val) {
+    val_ = val;
+    initialized_ = true;
+  }
+  
+  operator T() const {
+    VERIFY(initialized_ && "Uninitialized access.");
+    return val_;
+  }
+ 
+ private:
+  T val_;
+  bool initialized_;
+};
diff --git a/contrib/eigen/test/adjoint.cpp b/contrib/eigen/test/adjoint.cpp
index 37032d2205ff3f45e1ab56e5e3f328582ec346d1..4c4f98bb978c0277957089217caf9c21f14f4cad 100644
--- a/contrib/eigen/test/adjoint.cpp
+++ b/contrib/eigen/test/adjoint.cpp
@@ -143,9 +143,55 @@ template<typename MatrixType> void adjoint(const MatrixType& m)
   RealVectorType rv1 = RealVectorType::Random(rows);
   VERIFY_IS_APPROX(v1.dot(rv1.template cast<Scalar>()), v1.dot(rv1));
   VERIFY_IS_APPROX(rv1.template cast<Scalar>().dot(v1), rv1.dot(v1));
+
+  VERIFY( is_same_type(m1,m1.template conjugateIf<false>()) );
+  VERIFY( is_same_type(m1.conjugate(),m1.template conjugateIf<true>()) );
+}
+
+template<int>
+void adjoint_extra()
+{
+  MatrixXcf a(10,10), b(10,10);
+  VERIFY_RAISES_ASSERT(a = a.transpose());
+  VERIFY_RAISES_ASSERT(a = a.transpose() + b);
+  VERIFY_RAISES_ASSERT(a = b + a.transpose());
+  VERIFY_RAISES_ASSERT(a = a.conjugate().transpose());
+  VERIFY_RAISES_ASSERT(a = a.adjoint());
+  VERIFY_RAISES_ASSERT(a = a.adjoint() + b);
+  VERIFY_RAISES_ASSERT(a = b + a.adjoint());
+
+  // no assertion should be triggered for these cases:
+  a.transpose() = a.transpose();
+  a.transpose() += a.transpose();
+  a.transpose() += a.transpose() + b;
+  a.transpose() = a.adjoint();
+  a.transpose() += a.adjoint();
+  a.transpose() += a.adjoint() + b;
+
+  // regression tests for check_for_aliasing
+  MatrixXd c(10,10);
+  c = 1.0 * MatrixXd::Ones(10,10) + c;
+  c = MatrixXd::Ones(10,10) * 1.0 + c;
+  c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
+  c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
+
+  // regression for bug 1646
+  for (int j = 0; j < 10; ++j) {
+    c.col(j).head(j) = c.row(j).head(j);
+  }
+
+  for (int j = 0; j < 10; ++j) {
+    c.col(j) = c.row(j);
+  }
+
+  a.conservativeResize(1,1);
+  a = a.transpose();
+
+  a.conservativeResize(0,0);
+  a = a.transpose();
 }
 
-void test_adjoint()
+EIGEN_DECLARE_TEST(adjoint)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( adjoint(Matrix<float, 1, 1>()) );
@@ -168,32 +214,6 @@ void test_adjoint()
   // test a large static matrix only once
   CALL_SUBTEST_7( adjoint(Matrix<float, 100, 100>()) );
 
-#ifdef EIGEN_TEST_PART_13
-  {
-    MatrixXcf a(10,10), b(10,10);
-    VERIFY_RAISES_ASSERT(a = a.transpose());
-    VERIFY_RAISES_ASSERT(a = a.transpose() + b);
-    VERIFY_RAISES_ASSERT(a = b + a.transpose());
-    VERIFY_RAISES_ASSERT(a = a.conjugate().transpose());
-    VERIFY_RAISES_ASSERT(a = a.adjoint());
-    VERIFY_RAISES_ASSERT(a = a.adjoint() + b);
-    VERIFY_RAISES_ASSERT(a = b + a.adjoint());
-
-    // no assertion should be triggered for these cases:
-    a.transpose() = a.transpose();
-    a.transpose() += a.transpose();
-    a.transpose() += a.transpose() + b;
-    a.transpose() = a.adjoint();
-    a.transpose() += a.adjoint();
-    a.transpose() += a.adjoint() + b;
-
-    // regression tests for check_for_aliasing
-    MatrixXd c(10,10);
-    c = 1.0 * MatrixXd::Ones(10,10) + c;
-    c = MatrixXd::Ones(10,10) * 1.0 + c;
-    c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
-    c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
-  }
-#endif
+  CALL_SUBTEST_13( adjoint_extra<0>() );
 }
 
diff --git a/contrib/eigen/test/array.cpp b/contrib/eigen/test/array_cwise.cpp
similarity index 62%
rename from contrib/eigen/test/array.cpp
rename to contrib/eigen/test/array_cwise.cpp
index 6950439b9cc7f4de9bb98a44008313e79837053c..0cc438b39ba4249c7f72fbba221f480630ae1aef 100644
--- a/contrib/eigen/test/array.cpp
+++ b/contrib/eigen/test/array_cwise.cpp
@@ -9,6 +9,79 @@
 
 #include "main.h"
 
+
+// Test the corner cases of pow(x, y) for real types.
+template<typename Scalar>
+void pow_test() {
+  const Scalar zero = Scalar(0);
+  const Scalar eps = Eigen::NumTraits<Scalar>::epsilon();
+  const Scalar one = Scalar(1);
+  const Scalar two = Scalar(2);
+  const Scalar three = Scalar(3);
+  const Scalar sqrt_half = Scalar(std::sqrt(0.5));
+  const Scalar sqrt2 = Scalar(std::sqrt(2));
+  const Scalar inf = Eigen::NumTraits<Scalar>::infinity();
+  const Scalar nan = Eigen::NumTraits<Scalar>::quiet_NaN();
+  const Scalar denorm_min = std::numeric_limits<Scalar>::denorm_min();
+  const Scalar min = (std::numeric_limits<Scalar>::min)();
+  const Scalar max = (std::numeric_limits<Scalar>::max)();
+  const Scalar max_exp = (static_cast<Scalar>(int(Eigen::NumTraits<Scalar>::max_exponent())) * Scalar(EIGEN_LN2)) / eps;
+
+  const static Scalar abs_vals[] = {zero,
+                                    denorm_min,
+                                    min,
+                                    eps,
+                                    sqrt_half,
+                                    one,
+                                    sqrt2,
+                                    two,
+                                    three,
+                                    max_exp,
+                                    max,
+                                    inf,
+                                    nan};
+  const int abs_cases = 13;
+  const int num_cases = 2*abs_cases * 2*abs_cases;
+  // Repeat the same value to make sure we hit the vectorized path.
+  const int num_repeats = 32;
+  Array<Scalar, Dynamic, Dynamic> x(num_repeats, num_cases);
+  Array<Scalar, Dynamic, Dynamic> y(num_repeats, num_cases);
+  int count = 0;
+  for (int i = 0; i < abs_cases; ++i) {
+    const Scalar abs_x = abs_vals[i];
+    for (int sign_x = 0; sign_x < 2; ++sign_x) {
+      Scalar x_case = sign_x == 0 ? -abs_x : abs_x;
+      for (int j = 0; j < abs_cases; ++j) {
+        const Scalar abs_y = abs_vals[j];
+        for (int sign_y = 0; sign_y < 2; ++sign_y) {
+          Scalar y_case = sign_y == 0 ? -abs_y : abs_y;
+          for (int repeat = 0; repeat < num_repeats; ++repeat) {
+            x(repeat, count) = x_case;
+            y(repeat, count) = y_case;
+          }
+          ++count;
+        }
+      }
+    }
+  }
+
+  Array<Scalar, Dynamic, Dynamic> actual = x.pow(y);
+  const Scalar tol = test_precision<Scalar>();
+  bool all_pass = true;
+  for (int i = 0; i < 1; ++i) {
+    for (int j = 0; j < num_cases; ++j) {
+      Scalar e = static_cast<Scalar>(std::pow(x(i,j), y(i,j)));
+      Scalar a = actual(i, j);
+      bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((numext::isnan)(a) && (numext::isnan)(e));
+      all_pass &= !fail;
+      if (fail) {
+        std::cout << "pow(" << x(i,j) << "," << y(i,j) << ")   =   " << a << " !=  " << e << std::endl;
+      }
+    }
+  }
+  VERIFY(all_pass);
+}
+
 template<typename ArrayType> void array(const ArrayType& m)
 {
   typedef typename ArrayType::Scalar Scalar;
@@ -17,7 +90,7 @@ template<typename ArrayType> void array(const ArrayType& m)
   typedef Array<Scalar, 1, ArrayType::ColsAtCompileTime> RowVectorType;
 
   Index rows = m.rows();
-  Index cols = m.cols(); 
+  Index cols = m.cols();
 
   ArrayType m1 = ArrayType::Random(rows, cols),
              m2 = ArrayType::Random(rows, cols),
@@ -43,25 +116,25 @@ template<typename ArrayType> void array(const ArrayType& m)
   VERIFY_IS_APPROX(m3, m1 + s2);
   m3 = m1;
   m3 -= s1;
-  VERIFY_IS_APPROX(m3, m1 - s1);  
-  
+  VERIFY_IS_APPROX(m3, m1 - s1);
+
   // scalar operators via Maps
   m3 = m1;
   ArrayType::Map(m1.data(), m1.rows(), m1.cols()) -= ArrayType::Map(m2.data(), m2.rows(), m2.cols());
   VERIFY_IS_APPROX(m1, m3 - m2);
-  
+
   m3 = m1;
   ArrayType::Map(m1.data(), m1.rows(), m1.cols()) += ArrayType::Map(m2.data(), m2.rows(), m2.cols());
   VERIFY_IS_APPROX(m1, m3 + m2);
-  
+
   m3 = m1;
   ArrayType::Map(m1.data(), m1.rows(), m1.cols()) *= ArrayType::Map(m2.data(), m2.rows(), m2.cols());
   VERIFY_IS_APPROX(m1, m3 * m2);
-  
+
   m3 = m1;
   m2 = ArrayType::Random(rows,cols);
   m2 = (m2==0).select(1,m2);
-  ArrayType::Map(m1.data(), m1.rows(), m1.cols()) /= ArrayType::Map(m2.data(), m2.rows(), m2.cols());  
+  ArrayType::Map(m1.data(), m1.rows(), m1.cols()) /= ArrayType::Map(m2.data(), m2.rows(), m2.cols());
   VERIFY_IS_APPROX(m1, m3 / m2);
 
   // reductions
@@ -83,7 +156,7 @@ template<typename ArrayType> void array(const ArrayType& m)
   VERIFY_IS_APPROX(m3.rowwise() += rv1, m1.rowwise() + rv1);
   m3 = m1;
   VERIFY_IS_APPROX(m3.rowwise() -= rv1, m1.rowwise() - rv1);
-  
+
   // Conversion from scalar
   VERIFY_IS_APPROX((m3 = s1), ArrayType::Constant(rows,cols,s1));
   VERIFY_IS_APPROX((m3 = 1),  ArrayType::Constant(rows,cols,1));
@@ -92,16 +165,31 @@ template<typename ArrayType> void array(const ArrayType& m)
                 ArrayType::RowsAtCompileTime==Dynamic?2:ArrayType::RowsAtCompileTime,
                 ArrayType::ColsAtCompileTime==Dynamic?2:ArrayType::ColsAtCompileTime,
                 ArrayType::Options> FixedArrayType;
-  FixedArrayType f1(s1);
-  VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1));
-  FixedArrayType f2(numext::real(s1));
-  VERIFY_IS_APPROX(f2, FixedArrayType::Constant(numext::real(s1)));
-  FixedArrayType f3((int)100*numext::real(s1));
-  VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1)));
-  f1.setRandom();
-  FixedArrayType f4(f1.data());
-  VERIFY_IS_APPROX(f4, f1);
-  
+  {
+    FixedArrayType f1(s1);
+    VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1));
+    FixedArrayType f2(numext::real(s1));
+    VERIFY_IS_APPROX(f2, FixedArrayType::Constant(numext::real(s1)));
+    FixedArrayType f3((int)100*numext::real(s1));
+    VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1)));
+    f1.setRandom();
+    FixedArrayType f4(f1.data());
+    VERIFY_IS_APPROX(f4, f1);
+  }
+  #if EIGEN_HAS_CXX11
+  {
+    FixedArrayType f1{s1};
+    VERIFY_IS_APPROX(f1, FixedArrayType::Constant(s1));
+    FixedArrayType f2{numext::real(s1)};
+    VERIFY_IS_APPROX(f2, FixedArrayType::Constant(numext::real(s1)));
+    FixedArrayType f3{(int)100*numext::real(s1)};
+    VERIFY_IS_APPROX(f3, FixedArrayType::Constant((int)100*numext::real(s1)));
+    f1.setRandom();
+    FixedArrayType f4{f1.data()};
+    VERIFY_IS_APPROX(f4, f1);
+  }
+  #endif
+
   // pow
   VERIFY_IS_APPROX(m1.pow(2), m1.square());
   VERIFY_IS_APPROX(pow(m1,2), m1.square());
@@ -120,10 +208,51 @@ template<typename ArrayType> void array(const ArrayType& m)
 
   // Check possible conflicts with 1D ctor
   typedef Array<Scalar, Dynamic, 1> OneDArrayType;
-  OneDArrayType o1(rows);
-  VERIFY(o1.size()==rows);
-  OneDArrayType o4((int)rows);
-  VERIFY(o4.size()==rows);
+  {
+    OneDArrayType o1(rows);
+    VERIFY(o1.size()==rows);
+    OneDArrayType o2(static_cast<int>(rows));
+    VERIFY(o2.size()==rows);
+  }
+  #if EIGEN_HAS_CXX11
+  {
+    OneDArrayType o1{rows};
+    VERIFY(o1.size()==rows);
+    OneDArrayType o4{int(rows)};
+    VERIFY(o4.size()==rows);
+  }
+  #endif
+  // Check possible conflicts with 2D ctor
+  typedef Array<Scalar, Dynamic, Dynamic> TwoDArrayType;
+  typedef Array<Scalar, 2, 1> ArrayType2;
+  {
+    TwoDArrayType o1(rows,cols);
+    VERIFY(o1.rows()==rows);
+    VERIFY(o1.cols()==cols);
+    TwoDArrayType o2(static_cast<int>(rows),static_cast<int>(cols));
+    VERIFY(o2.rows()==rows);
+    VERIFY(o2.cols()==cols);
+
+    ArrayType2 o3(rows,cols);
+    VERIFY(o3(0)==Scalar(rows) && o3(1)==Scalar(cols));
+    ArrayType2 o4(static_cast<int>(rows),static_cast<int>(cols));
+    VERIFY(o4(0)==Scalar(rows) && o4(1)==Scalar(cols));
+  }
+  #if EIGEN_HAS_CXX11
+  {
+    TwoDArrayType o1{rows,cols};
+    VERIFY(o1.rows()==rows);
+    VERIFY(o1.cols()==cols);
+    TwoDArrayType o2{int(rows),int(cols)};
+    VERIFY(o2.rows()==rows);
+    VERIFY(o2.cols()==cols);
+
+    ArrayType2 o3{rows,cols};
+    VERIFY(o3(0)==Scalar(rows) && o3(1)==Scalar(cols));
+    ArrayType2 o4{int(rows),int(cols)};
+    VERIFY(o4(0)==Scalar(rows) && o4(1)==Scalar(cols));
+  }
+  #endif
 }
 
 template<typename ArrayType> void comparisons(const ArrayType& m)
@@ -142,7 +271,7 @@ template<typename ArrayType> void comparisons(const ArrayType& m)
             m2 = ArrayType::Random(rows, cols),
             m3(rows, cols),
             m4 = m1;
-  
+
   m4 = (m4.abs()==Scalar(0)).select(1,m4);
 
   VERIFY(((m1 + Scalar(1)) > m1).all());
@@ -195,7 +324,7 @@ template<typename ArrayType> void comparisons(const ArrayType& m)
   RealScalar a = m1.abs().mean();
   VERIFY( (m1<-a || m1>a).count() == (m1.abs()>a).count());
 
-  typedef Array<typename ArrayType::Index, Dynamic, 1> ArrayOfIndices;
+  typedef Array<Index, Dynamic, 1> ArrayOfIndices;
 
   // TODO allows colwise/rowwise for array
   VERIFY_IS_APPROX(((m1.abs()+1)>RealScalar(0.1)).colwise().count(), ArrayOfIndices::Constant(cols,rows).transpose());
@@ -217,7 +346,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
             m3(rows, cols),
             m4 = m1;
 
-  m4 = (m4.abs()==Scalar(0)).select(1,m4);
+  m4 = (m4.abs()==Scalar(0)).select(Scalar(1),m4);
 
   Scalar  s1 = internal::random<Scalar>();
 
@@ -231,31 +360,39 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
   VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
   VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
+#if EIGEN_HAS_CXX11_MATH
+  VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1)));
+  VERIFY_IS_APPROX(m1.sinh().asinh(), asinh(sinh(m1)));
+  VERIFY_IS_APPROX(m1.cosh().acosh(), acosh(cosh(m1)));
+#endif
+  VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
 
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
   VERIFY_IS_APPROX(m1.round(), round(m1));
+  VERIFY_IS_APPROX(m1.rint(), rint(m1));
   VERIFY_IS_APPROX(m1.floor(), floor(m1));
   VERIFY_IS_APPROX(m1.ceil(), ceil(m1));
   VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
   VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
   VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
-  VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
+  VERIFY_IS_APPROX(m4.inverse(), inverse(m4));
   VERIFY_IS_APPROX(m1.abs(), abs(m1));
   VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
   VERIFY_IS_APPROX(m1.square(), square(m1));
   VERIFY_IS_APPROX(m1.cube(), cube(m1));
   VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval()));
   VERIFY_IS_APPROX(m1.sign(), sign(m1));
+  VERIFY((m1.sqrt().sign().isNaN() == (Eigen::isnan)(sign(sqrt(m1)))).all());
 
-
-  // avoid NaNs with abs() so verification doesn't fail
-  m3 = m1.abs();
-  VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m1)));
-  VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m1)));
-  VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m1)));
+  // avoid inf and NaNs so verification doesn't fail
+  m3 = m4.abs();
+  VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m3)));
+  VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m3)));
+  VERIFY_IS_APPROX(rsqrt(m3), Scalar(1)/sqrt(abs(m3)));
   VERIFY_IS_APPROX(m3.log(), log(m3));
   VERIFY_IS_APPROX(m3.log1p(), log1p(m3));
   VERIFY_IS_APPROX(m3.log10(), log10(m3));
+  VERIFY_IS_APPROX(m3.log2(), log2(m3));
 
 
   VERIFY((!(m1>m2) == (m1<=m2)).all());
@@ -263,17 +400,24 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   VERIFY_IS_APPROX(sin(m1.asin()), m1);
   VERIFY_IS_APPROX(cos(m1.acos()), m1);
   VERIFY_IS_APPROX(tan(m1.atan()), m1);
-  VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
-  VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
-  VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
-  VERIFY_IS_APPROX(arg(m1), ((m1<0).template cast<Scalar>())*std::acos(-1.0));
+  VERIFY_IS_APPROX(sinh(m1), Scalar(0.5)*(exp(m1)-exp(-m1)));
+  VERIFY_IS_APPROX(cosh(m1), Scalar(0.5)*(exp(m1)+exp(-m1)));
+  VERIFY_IS_APPROX(tanh(m1), (Scalar(0.5)*(exp(m1)-exp(-m1)))/(Scalar(0.5)*(exp(m1)+exp(-m1))));
+  VERIFY_IS_APPROX(logistic(m1), (Scalar(1)/(Scalar(1)+exp(-m1))));
+  VERIFY_IS_APPROX(arg(m1), ((m1<Scalar(0)).template cast<Scalar>())*Scalar(std::acos(Scalar(-1))));
   VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all());
-  VERIFY((Eigen::isnan)((m1*0.0)/0.0).all());
-  VERIFY((Eigen::isinf)(m4/0.0).all());
-  VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*0.0/0.0)) && (!(Eigen::isfinite)(m4/0.0))).all());
-  VERIFY_IS_APPROX(inverse(inverse(m1)),m1);
+  VERIFY((rint(m1) <= ceil(m1) && rint(m1) >= floor(m1)).all());
+  VERIFY(((ceil(m1) - round(m1)) <= Scalar(0.5) || (round(m1) - floor(m1)) <= Scalar(0.5)).all());
+  VERIFY(((ceil(m1) - round(m1)) <= Scalar(1.0) && (round(m1) - floor(m1)) <= Scalar(1.0)).all());
+  VERIFY(((ceil(m1) - rint(m1)) <= Scalar(0.5) || (rint(m1) - floor(m1)) <= Scalar(0.5)).all());
+  VERIFY(((ceil(m1) - rint(m1)) <= Scalar(1.0) && (rint(m1) - floor(m1)) <= Scalar(1.0)).all());
+  VERIFY((Eigen::isnan)((m1*Scalar(0))/Scalar(0)).all());
+  VERIFY((Eigen::isinf)(m4/Scalar(0)).all());
+  VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*Scalar(0)/Scalar(0))) && (!(Eigen::isfinite)(m4/Scalar(0)))).all());
+  VERIFY_IS_APPROX(inverse(inverse(m4)),m4);
   VERIFY((abs(m1) == m1 || abs(m1) == -m1).all());
-  VERIFY_IS_APPROX(m3, sqrt(abs2(m1)));
+  VERIFY_IS_APPROX(m3, sqrt(abs2(m3)));
+  VERIFY_IS_APPROX(m1.absolute_difference(m2), (m1 > m2).select(m1 - m2, m2 - m1));
   VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() );
   VERIFY_IS_APPROX( m1*m1.sign(),m1.abs());
   VERIFY_IS_APPROX(m1.sign() * m1.abs(), m1);
@@ -285,20 +429,29 @@ template<typename ArrayType> void array_real(const ArrayType& m)
 
   // shift argument of logarithm so that it is not zero
   Scalar smallNumber = NumTraits<Scalar>::dummy_precision();
-  VERIFY_IS_APPROX((m3 + smallNumber).log() , log(abs(m1) + smallNumber));
-  VERIFY_IS_APPROX((m3 + smallNumber + 1).log() , log1p(abs(m1) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber).log() , log(abs(m3) + smallNumber));
+  VERIFY_IS_APPROX((m3 + smallNumber + Scalar(1)).log() , log1p(abs(m3) + smallNumber));
 
   VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2));
   VERIFY_IS_APPROX(m1.exp(), exp(m1));
   VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp());
 
+  VERIFY_IS_APPROX(m1.expm1(), expm1(m1));
+  VERIFY_IS_APPROX((m3 + smallNumber).exp() - Scalar(1), expm1(abs(m3) + smallNumber));
+
   VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt());
 
   VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt());
   VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt());
 
-  VERIFY_IS_APPROX(log10(m3), log(m3)/log(10));
+  // Avoid inf and NaN.
+  m3 = (m1.square()<NumTraits<Scalar>::epsilon()).select(Scalar(1),m3);
+  VERIFY_IS_APPROX(m3.pow(RealScalar(-2)), m3.square().inverse());
+  pow_test<Scalar>();
+
+  VERIFY_IS_APPROX(log10(m3), log(m3)/numext::log(Scalar(10)));
+  VERIFY_IS_APPROX(log2(m3), log(m3)/numext::log(Scalar(2)));
 
   // scalar by array division
   const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
@@ -325,7 +478,7 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   ArrayType m1 = ArrayType::Random(rows, cols),
             m2(rows, cols),
             m4 = m1;
-  
+
   m4.real() = (m4.real().abs()==RealScalar(0)).select(RealScalar(1),m4.real());
   m4.imag() = (m4.imag().abs()==RealScalar(0)).select(RealScalar(1),m4.imag());
 
@@ -342,13 +495,15 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
   VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
   VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
+  VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
   VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
   VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
   VERIFY((m1.isFinite() == (Eigen::isfinite)(m1)).all());
-  VERIFY_IS_APPROX(m1.inverse(), inverse(m1));
+  VERIFY_IS_APPROX(m4.inverse(), inverse(m4));
   VERIFY_IS_APPROX(m1.log(), log(m1));
   VERIFY_IS_APPROX(m1.log10(), log10(m1));
+  VERIFY_IS_APPROX(m1.log2(), log2(m1));
   VERIFY_IS_APPROX(m1.abs(), abs(m1));
   VERIFY_IS_APPROX(m1.abs2(), abs2(m1));
   VERIFY_IS_APPROX(m1.sqrt(), sqrt(m1));
@@ -362,9 +517,15 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   VERIFY_IS_APPROX(m1.exp(), exp(m1));
   VERIFY_IS_APPROX(m1.exp() / m2.exp(),(m1-m2).exp());
 
+  VERIFY_IS_APPROX(m1.expm1(), expm1(m1));
+  VERIFY_IS_APPROX(expm1(m1), exp(m1) - 1.);
+  // Check for larger magnitude complex numbers that expm1 matches exp - 1.
+  VERIFY_IS_APPROX(expm1(10. * m1), exp(10. * m1) - 1.);
+
   VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
   VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
   VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
+  VERIFY_IS_APPROX(logistic(m1), (1.0/(1.0 + exp(-m1))));
 
   for (Index i = 0; i < m.rows(); ++i)
     for (Index j = 0; j < m.cols(); ++j)
@@ -393,11 +554,12 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
 
   VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*zero/zero)) && (!(Eigen::isfinite)(m1/zero))).all());
 
-  VERIFY_IS_APPROX(inverse(inverse(m1)),m1);
+  VERIFY_IS_APPROX(inverse(inverse(m4)),m4);
   VERIFY_IS_APPROX(conj(m1.conjugate()), m1);
   VERIFY_IS_APPROX(abs(m1), sqrt(square(m1.real())+square(m1.imag())));
   VERIFY_IS_APPROX(abs(m1), sqrt(abs2(m1)));
   VERIFY_IS_APPROX(log10(m1), log(m1)/log(10));
+  VERIFY_IS_APPROX(log2(m1), log(m1)/log(2));
 
   VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() );
   VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1);
@@ -415,7 +577,11 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   VERIFY_IS_APPROX(m2, m1.transpose());
   m2.transposeInPlace();
   VERIFY_IS_APPROX(m2, m1);
-
+  // Check vectorized inplace transpose.
+  ArrayType m5 = ArrayType::Random(131, 131);
+  ArrayType m6 = m5;
+  m6.transposeInPlace();
+  VERIFY_IS_APPROX(m6, m5.transpose());
 }
 
 template<typename ArrayType> void min_max(const ArrayType& m)
@@ -444,9 +610,58 @@ template<typename ArrayType> void min_max(const ArrayType& m)
   VERIFY_IS_APPROX(ArrayType::Constant(rows,cols, maxM1), (m1.max)( maxM1));
   VERIFY_IS_APPROX(m1, (m1.max)( minM1));
 
+
+  // min/max with various NaN propagation options.
+  if (m1.size() > 1 && !NumTraits<Scalar>::IsInteger) {
+    m1(0,0) = NumTraits<Scalar>::quiet_NaN();
+    maxM1 = m1.template maxCoeff<PropagateNaN>();
+    minM1 = m1.template minCoeff<PropagateNaN>();
+    VERIFY((numext::isnan)(maxM1));
+    VERIFY((numext::isnan)(minM1));
+
+    maxM1 = m1.template maxCoeff<PropagateNumbers>();
+    minM1 = m1.template minCoeff<PropagateNumbers>();
+    VERIFY(!(numext::isnan)(maxM1));
+    VERIFY(!(numext::isnan)(minM1));
+  }
+}
+
+template<int N>
+struct shift_left {
+  template<typename Scalar>
+  Scalar operator()(const Scalar& v) const {
+    return v << N;
+  }
+};
+
+template<int N>
+struct arithmetic_shift_right {
+  template<typename Scalar>
+  Scalar operator()(const Scalar& v) const {
+    return v >> N;
+  }
+};
+
+template<typename ArrayType> void array_integer(const ArrayType& m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  ArrayType m1 = ArrayType::Random(rows, cols),
+            m2(rows, cols);
+
+  m2 = m1.template shiftLeft<2>();
+  VERIFY( (m2 == m1.unaryExpr(shift_left<2>())).all() );
+  m2 = m1.template shiftLeft<9>();
+  VERIFY( (m2 == m1.unaryExpr(shift_left<9>())).all() );
+  
+  m2 = m1.template shiftRight<2>();
+  VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<2>())).all() );
+  m2 = m1.template shiftRight<9>();
+  VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<9>())).all() );
 }
 
-void test_array()
+EIGEN_DECLARE_TEST(array_cwise)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( array(Array<float, 1, 1>()) );
@@ -455,6 +670,9 @@ void test_array()
     CALL_SUBTEST_4( array(ArrayXXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_5( array(ArrayXXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_6( array(ArrayXXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( array(Array<Index,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( array_integer(ArrayXXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( array_integer(Array<Index,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( comparisons(Array<float, 1, 1>()) );
@@ -475,6 +693,8 @@ void test_array()
     CALL_SUBTEST_2( array_real(Array22f()) );
     CALL_SUBTEST_3( array_real(Array44d()) );
     CALL_SUBTEST_5( array_real(ArrayXXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( array_real(Array<Eigen::half, 32, 32>()) );
+    CALL_SUBTEST_8( array_real(Array<Eigen::bfloat16, 32, 32>()) );
   }
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_4( array_complex(ArrayXXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/contrib/eigen/test/array_for_matrix.cpp b/contrib/eigen/test/array_for_matrix.cpp
index a05bba191f17f7e723107cb17c615789c2e19098..fb6be351e69af963db61c903ea0e9f6e3c6d52da 100644
--- a/contrib/eigen/test/array_for_matrix.cpp
+++ b/contrib/eigen/test/array_for_matrix.cpp
@@ -57,7 +57,14 @@ template<typename MatrixType> void array_for_matrix(const MatrixType& m)
   VERIFY_IS_APPROX(m3.rowwise() -= rv1, m1.rowwise() - rv1);
   
   // empty objects
-  VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().sum(),  RowVectorType::Zero(cols));
+  VERIFY_IS_APPROX((m1.template block<0,Dynamic>(0,0,0,cols).colwise().sum()), RowVectorType::Zero(cols));
+  VERIFY_IS_APPROX((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().sum()), ColVectorType::Zero(rows));
+  VERIFY_IS_APPROX((m1.template block<0,Dynamic>(0,0,0,cols).colwise().prod()), RowVectorType::Ones(cols));
+  VERIFY_IS_APPROX((m1.template block<Dynamic,0>(0,0,rows,0).rowwise().prod()), ColVectorType::Ones(rows));
+
+  VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().sum(), RowVectorType::Zero(cols));
+  VERIFY_IS_APPROX(m1.block(0,0,rows,0).rowwise().sum(), ColVectorType::Zero(rows));
+  VERIFY_IS_APPROX(m1.block(0,0,0,cols).colwise().prod(), RowVectorType::Ones(cols));
   VERIFY_IS_APPROX(m1.block(0,0,rows,0).rowwise().prod(), ColVectorType::Ones(rows));
   
   // verify the const accessors exist
@@ -138,7 +145,7 @@ template<typename MatrixType> void comparisons(const MatrixType& m)
   RealScalar a = m1.cwiseAbs().mean();
   VERIFY( ((m1.array()<-a).matrix() || (m1.array()>a).matrix()).count() == (m1.cwiseAbs().array()>a).count());
 
-  typedef Matrix<typename MatrixType::Index, Dynamic, 1> VectorOfIndices;
+  typedef Matrix<Index, Dynamic, 1> VectorOfIndices;
 
   // TODO allows colwise/rowwise for array
   VERIFY_IS_APPROX(((m1.array().abs()+1)>RealScalar(0.1)).matrix().colwise().count(), VectorOfIndices::Constant(cols,rows).transpose());
@@ -256,7 +263,7 @@ void regrrssion_bug_1410()
   VERIFY((internal::traits<MatrixWrapper<Array4i> >::Flags&LvalueBit)==LvalueBit);
 }
 
-void test_array_for_matrix()
+EIGEN_DECLARE_TEST(array_for_matrix)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( array_for_matrix(Matrix<float, 1, 1>()) );
diff --git a/contrib/eigen/test/array_of_string.cpp b/contrib/eigen/test/array_of_string.cpp
index e23b7c59e6eddb9f6ec18f1791ef45b2392d2b50..23e51529b105f2b45b807e8d6bc6fdcf7f2a7058 100644
--- a/contrib/eigen/test/array_of_string.cpp
+++ b/contrib/eigen/test/array_of_string.cpp
@@ -9,7 +9,7 @@
 
 #include "main.h"
 
-void test_array_of_string()
+EIGEN_DECLARE_TEST(array_of_string)
 {
   typedef Array<std::string,1,Dynamic> ArrayXs;
   ArrayXs a1(3), a2(3), a3(3), a3ref(3);
diff --git a/contrib/eigen/test/array_replicate.cpp b/contrib/eigen/test/array_replicate.cpp
index 0dad5bace7bc0fe4a8da6370b43380c9fd06a195..057c3c77b4996fbaf92d2b0fa063ac4caa33ef22 100644
--- a/contrib/eigen/test/array_replicate.cpp
+++ b/contrib/eigen/test/array_replicate.cpp
@@ -68,7 +68,7 @@ template<typename MatrixType> void replicate(const MatrixType& m)
   VERIFY_IS_APPROX(vx1, v1.colwise().replicate(f2));
 }
 
-void test_array_replicate()
+EIGEN_DECLARE_TEST(array_replicate)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( replicate(Matrix<float, 1, 1>()) );
diff --git a/contrib/eigen/test/array_reverse.cpp b/contrib/eigen/test/array_reverse.cpp
index 9d5b9a66dd6fa3137128aba884798e8967279527..c77528a5bba280a30af5fc5c0feff638d1b4762d 100644
--- a/contrib/eigen/test/array_reverse.cpp
+++ b/contrib/eigen/test/array_reverse.cpp
@@ -123,7 +123,70 @@ template<typename MatrixType> void reverse(const MatrixType& m)
   VERIFY_IS_APPROX(x, m1(r, cols - 1 - c));
 }
 
-void test_array_reverse()
+template<int>
+void array_reverse_extra()
+{
+  Vector4f x; x << 1, 2, 3, 4;
+  Vector4f y; y << 4, 3, 2, 1;
+  VERIFY(x.reverse()[1] == 3);
+  VERIFY(x.reverse() == y);
+}
+
+// Simpler version of reverseInPlace leveraging a bug
+// in clang 6/7 with -O2 and AVX or AVX512 enabled.
+// This simpler version ensure that the clang bug is not simply hidden
+// through mis-inlining of reverseInPlace or other minor changes.
+template<typename MatrixType>
+EIGEN_DONT_INLINE
+void bug1684_job1(MatrixType& m1, MatrixType& m2)
+{
+  m2 = m1;
+  m2.col(0).swap(m2.col(3));
+  m2.col(1).swap(m2.col(2));
+}
+
+template<typename MatrixType>
+EIGEN_DONT_INLINE
+void bug1684_job2(MatrixType& m1, MatrixType& m2)
+{
+  m2 = m1; // load m1/m2 in AVX registers
+  m1.col(0) = m2.col(3); // perform 128 bits moves
+  m1.col(1) = m2.col(2);
+  m1.col(2) = m2.col(1);
+  m1.col(3) = m2.col(0);
+}
+
+template<typename MatrixType>
+EIGEN_DONT_INLINE
+void bug1684_job3(MatrixType& m1, MatrixType& m2)
+{
+  m2 = m1;
+  Vector4f tmp;
+  tmp = m2.col(0);
+  m2.col(0) = m2.col(3);
+  m2.col(3) = tmp;
+  tmp = m2.col(1);
+  m2.col(1) = m2.col(2);
+  m2.col(2) = tmp;
+  
+}
+
+template<int>
+void bug1684()
+{
+  Matrix4f m1 = Matrix4f::Random();
+  Matrix4f m2 = Matrix4f::Random();
+  bug1684_job1(m1,m2);
+  VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval());
+  bug1684_job2(m1,m2);
+  VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval());
+  // This one still fail after our swap's workaround,
+  // but I expect users not to implement their own swap.
+  // bug1684_job3(m1,m2);
+  // VERIFY_IS_APPROX(m2, m1.rowwise().reverse().eval());
+}
+
+EIGEN_DECLARE_TEST(array_reverse)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( reverse(Matrix<float, 1, 1>()) );
@@ -135,11 +198,7 @@ void test_array_reverse()
     CALL_SUBTEST_7( reverse(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_8( reverse(Matrix<float, 100, 100>()) );
     CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_3( bug1684<0>() );
   }
-#ifdef EIGEN_TEST_PART_3
-  Vector4f x; x << 1, 2, 3, 4;
-  Vector4f y; y << 4, 3, 2, 1;
-  VERIFY(x.reverse()[1] == 3);
-  VERIFY(x.reverse() == y);
-#endif
+  CALL_SUBTEST_3( array_reverse_extra<0>() );
 }
diff --git a/contrib/eigen/test/bandmatrix.cpp b/contrib/eigen/test/bandmatrix.cpp
index f8c38f7c31d935008a3d0434ca590b2052abfb37..66a1b0db4734e01baf0e4368a9bc2fb7f56c58f7 100644
--- a/contrib/eigen/test/bandmatrix.cpp
+++ b/contrib/eigen/test/bandmatrix.cpp
@@ -59,7 +59,7 @@ template<typename MatrixType> void bandmatrix(const MatrixType& _m)
 
 using Eigen::internal::BandMatrix;
 
-void test_bandmatrix()
+EIGEN_DECLARE_TEST(bandmatrix)
 {
   for(int i = 0; i < 10*g_repeat ; i++) {
     Index rows = internal::random<Index>(1,10);
diff --git a/contrib/eigen/test/basicstuff.cpp b/contrib/eigen/test/basicstuff.cpp
index 2e532f7a599fbfbb9565cdc5bd22604362a3254c..4ca607c8248d99a32efbfa53cf779177c70d3360 100644
--- a/contrib/eigen/test/basicstuff.cpp
+++ b/contrib/eigen/test/basicstuff.cpp
@@ -10,6 +10,7 @@
 #define EIGEN_NO_STATIC_ASSERT
 
 #include "main.h"
+#include "random_without_cast_overflow.h"
 
 template<typename MatrixType> void basicStuff(const MatrixType& m)
 {
@@ -48,6 +49,22 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
   v1[r] = x;
   VERIFY_IS_APPROX(x, v1[r]);
 
+  // test fetching with various index types.
+  Index r1 = internal::random<Index>(0, numext::mini(Index(127),rows-1));
+  x = v1(static_cast<char>(r1));
+  x = v1(static_cast<signed char>(r1));
+  x = v1(static_cast<unsigned char>(r1));
+  x = v1(static_cast<signed short>(r1));
+  x = v1(static_cast<unsigned short>(r1));
+  x = v1(static_cast<signed int>(r1));
+  x = v1(static_cast<unsigned int>(r1));
+  x = v1(static_cast<signed long>(r1));
+  x = v1(static_cast<unsigned long>(r1));
+#if EIGEN_HAS_CXX11
+  x = v1(static_cast<long long int>(r1));
+  x = v1(static_cast<unsigned long long int>(r1));
+#endif
+
   VERIFY_IS_APPROX(               v1,    v1);
   VERIFY_IS_NOT_APPROX(           v1,    2*v1);
   VERIFY_IS_MUCH_SMALLER_THAN(    vzero, v1);
@@ -74,7 +91,7 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
   Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> cv(rows);
   rv = square.row(r);
   cv = square.col(r);
-  
+
   VERIFY_IS_APPROX(rv, cv.transpose());
 
   if(cols!=1 && rows!=1 && MatrixType::SizeAtCompileTime!=Dynamic)
@@ -104,28 +121,28 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
   m1 = m2;
   VERIFY(m1==m2);
   VERIFY(!(m1!=m2));
-  
+
   // check automatic transposition
   sm2.setZero();
-  for(typename MatrixType::Index i=0;i<rows;++i)
+  for(Index i=0;i<rows;++i)
     sm2.col(i) = sm1.row(i);
   VERIFY_IS_APPROX(sm2,sm1.transpose());
-  
+
   sm2.setZero();
-  for(typename MatrixType::Index i=0;i<rows;++i)
+  for(Index i=0;i<rows;++i)
     sm2.col(i).noalias() = sm1.row(i);
   VERIFY_IS_APPROX(sm2,sm1.transpose());
-  
+
   sm2.setZero();
-  for(typename MatrixType::Index i=0;i<rows;++i)
+  for(Index i=0;i<rows;++i)
     sm2.col(i).noalias() += sm1.row(i);
   VERIFY_IS_APPROX(sm2,sm1.transpose());
-  
+
   sm2.setZero();
-  for(typename MatrixType::Index i=0;i<rows;++i)
+  for(Index i=0;i<rows;++i)
     sm2.col(i).noalias() -= sm1.row(i);
   VERIFY_IS_APPROX(sm2,-sm1.transpose());
-  
+
   // check ternary usage
   {
     bool b = internal::random<int>(0,10)>5;
@@ -178,16 +195,78 @@ template<typename MatrixType> void basicStuffComplex(const MatrixType& m)
   VERIFY(!static_cast<const MatrixType&>(cm).imag().isZero());
 }
 
-#ifdef EIGEN_TEST_PART_2
-void casting()
+template<typename SrcScalar, typename TgtScalar>
+struct casting_test {
+  static void run() {
+    Matrix<SrcScalar,4,4> m;
+    for (int i=0; i<m.rows(); ++i) {
+      for (int j=0; j<m.cols(); ++j) {
+        m(i, j) = internal::random_without_cast_overflow<SrcScalar,TgtScalar>::value();
+      }
+    }
+    Matrix<TgtScalar,4,4> n = m.template cast<TgtScalar>();
+    for (int i=0; i<m.rows(); ++i) {
+      for (int j=0; j<m.cols(); ++j) {
+        VERIFY_IS_APPROX(n(i, j), (internal::cast<SrcScalar,TgtScalar>(m(i, j))));
+      }
+    }
+  }
+};
+
+template<typename SrcScalar, typename EnableIf = void>
+struct casting_test_runner {
+  static void run() {
+    casting_test<SrcScalar, bool>::run();
+    casting_test<SrcScalar, int8_t>::run();
+    casting_test<SrcScalar, uint8_t>::run();
+    casting_test<SrcScalar, int16_t>::run();
+    casting_test<SrcScalar, uint16_t>::run();
+    casting_test<SrcScalar, int32_t>::run();
+    casting_test<SrcScalar, uint32_t>::run();
+#if EIGEN_HAS_CXX11
+    casting_test<SrcScalar, int64_t>::run();
+    casting_test<SrcScalar, uint64_t>::run();
+#endif
+    casting_test<SrcScalar, half>::run();
+    casting_test<SrcScalar, bfloat16>::run();
+    casting_test<SrcScalar, float>::run();
+    casting_test<SrcScalar, double>::run();
+    casting_test<SrcScalar, std::complex<float> >::run();
+    casting_test<SrcScalar, std::complex<double> >::run();
+  }
+};
+
+template<typename SrcScalar>
+struct casting_test_runner<SrcScalar, typename internal::enable_if<(NumTraits<SrcScalar>::IsComplex)>::type>
 {
-  Matrix4f m = Matrix4f::Random(), m2;
-  Matrix4d n = m.cast<double>();
-  VERIFY(m.isApprox(n.cast<float>()));
-  m2 = m.cast<float>(); // check the specialization when NewType == Type
-  VERIFY(m.isApprox(m2));
-}
+  static void run() {
+    // Only a few casts from std::complex<T> are defined.
+    casting_test<SrcScalar, half>::run();
+    casting_test<SrcScalar, bfloat16>::run();
+    casting_test<SrcScalar, std::complex<float> >::run();
+    casting_test<SrcScalar, std::complex<double> >::run();
+  }
+};
+
+void casting_all() {
+  casting_test_runner<bool>::run();
+  casting_test_runner<int8_t>::run();
+  casting_test_runner<uint8_t>::run();
+  casting_test_runner<int16_t>::run();
+  casting_test_runner<uint16_t>::run();
+  casting_test_runner<int32_t>::run();
+  casting_test_runner<uint32_t>::run();
+#if EIGEN_HAS_CXX11
+  casting_test_runner<int64_t>::run();
+  casting_test_runner<uint64_t>::run();
 #endif
+  casting_test_runner<half>::run();
+  casting_test_runner<bfloat16>::run();
+  casting_test_runner<float>::run();
+  casting_test_runner<double>::run();
+  casting_test_runner<std::complex<float> >::run();
+  casting_test_runner<std::complex<double> >::run();
+}
 
 template <typename Scalar>
 void fixedSizeMatrixConstruction()
@@ -195,12 +274,12 @@ void fixedSizeMatrixConstruction()
   Scalar raw[4];
   for(int k=0; k<4; ++k)
     raw[k] = internal::random<Scalar>();
-  
+
   {
     Matrix<Scalar,4,1> m(raw);
     Array<Scalar,4,1> a(raw);
     for(int k=0; k<4; ++k) VERIFY(m(k) == raw[k]);
-    for(int k=0; k<4; ++k) VERIFY(a(k) == raw[k]);    
+    for(int k=0; k<4; ++k) VERIFY(a(k) == raw[k]);
     VERIFY_IS_EQUAL(m,(Matrix<Scalar,4,1>(raw[0],raw[1],raw[2],raw[3])));
     VERIFY((a==(Array<Scalar,4,1>(raw[0],raw[1],raw[2],raw[3]))).all());
   }
@@ -252,7 +331,7 @@ void fixedSizeMatrixConstruction()
   }
 }
 
-void test_basicstuff()
+EIGEN_DECLARE_TEST(basicstuff)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( basicStuff(Matrix<float, 1, 1>()) );
@@ -262,6 +341,7 @@ void test_basicstuff()
     CALL_SUBTEST_5( basicStuff(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_6( basicStuff(Matrix<float, 100, 100>()) );
     CALL_SUBTEST_7( basicStuff(Matrix<long double,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_8( casting_all() );
 
     CALL_SUBTEST_3( basicStuffComplex(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_5( basicStuffComplex(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
@@ -273,6 +353,4 @@ void test_basicstuff()
   CALL_SUBTEST_1(fixedSizeMatrixConstruction<int>());
   CALL_SUBTEST_1(fixedSizeMatrixConstruction<long int>());
   CALL_SUBTEST_1(fixedSizeMatrixConstruction<std::ptrdiff_t>());
-
-  CALL_SUBTEST_2(casting());
 }
diff --git a/contrib/eigen/test/bdcsvd.cpp b/contrib/eigen/test/bdcsvd.cpp
index 3ca273635c47ea664f85c54587c2c02b0886a11b..e92a7dc97d185d7b2e67992b09435f01f1a08d31 100644
--- a/contrib/eigen/test/bdcsvd.cpp
+++ b/contrib/eigen/test/bdcsvd.cpp
@@ -50,6 +50,8 @@ void bdcsvd_method()
   VERIFY_RAISES_ASSERT(m.bdcSvd().matrixU());
   VERIFY_RAISES_ASSERT(m.bdcSvd().matrixV());
   VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).solve(m), m);
+  VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m);
+  VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m);
 }
 
 // compare the Singular values returned with Jacobi and Bdc
@@ -66,7 +68,7 @@ void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computa
   if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
 }
 
-void test_bdcsvd()
+EIGEN_DECLARE_TEST(bdcsvd)
 {
   CALL_SUBTEST_3(( svd_verify_assert<BDCSVD<Matrix3f>  >(Matrix3f()) ));
   CALL_SUBTEST_4(( svd_verify_assert<BDCSVD<Matrix4d>  >(Matrix4d()) ));
@@ -108,7 +110,7 @@ void test_bdcsvd()
   CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
 
   // Check that preallocation avoids subsequent mallocs
-  // Disbaled because not supported by BDCSVD
+  // Disabled because not supported by BDCSVD
   // CALL_SUBTEST_9( svd_preallocate<void>() );
 
   CALL_SUBTEST_2( svd_underoverflow<void>() );
diff --git a/contrib/eigen/test/bfloat16_float.cpp b/contrib/eigen/test/bfloat16_float.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3de0b19a0ce6aaa8b65140dd2cebd88e33a1de1
--- /dev/null
+++ b/contrib/eigen/test/bfloat16_float.cpp
@@ -0,0 +1,378 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+#include <memory>
+#include <math.h>
+
+#include "main.h"
+
+#include <Eigen/src/Core/arch/Default/BFloat16.h>
+
+#define VERIFY_BFLOAT16_BITS_EQUAL(h, bits) \
+  VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
+
+// Make sure it's possible to forward declare Eigen::bfloat16
+namespace Eigen {
+struct bfloat16;
+}
+
+using Eigen::bfloat16;
+
+float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
+                    uint32_t low_mantissa) {
+  float dest;
+  uint32_t src = (sign << 31) + (exponent << 23) + (high_mantissa << 16) + low_mantissa;
+  memcpy(static_cast<void*>(&dest),
+         static_cast<const void*>(&src), sizeof(dest));
+  return dest;
+}
+
+template<typename T>
+ void test_roundtrip() {
+  // Representable T round trip via bfloat16
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(-std::numeric_limits<T>::infinity()))), -std::numeric_limits<T>::infinity());
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(std::numeric_limits<T>::infinity()))), std::numeric_limits<T>::infinity());
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-1.0)))), T(-1.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-0.5)))), T(-0.5));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(-0.0)))), T(-0.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(1.0)))), T(1.0));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(0.5)))), T(0.5));
+  VERIFY_IS_EQUAL((internal::cast<bfloat16,T>(internal::cast<T,bfloat16>(T(0.0)))), T(0.0));
+}
+
+void test_conversion()
+{
+  using Eigen::bfloat16_impl::__bfloat16_raw;
+
+  // Round-trip casts
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(1.0f))),
+    bfloat16(1.0f));
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(0.5f))),
+    bfloat16(0.5f));
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(-0.33333f))),
+    bfloat16(-0.33333f));
+   VERIFY_IS_EQUAL(
+    numext::bit_cast<bfloat16>(numext::bit_cast<numext::uint16_t>(bfloat16(0.0f))),
+    bfloat16(0.0f));
+
+  // Conversion from float.
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(1.0f), 0x3f80);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f), 0x3f00);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.33333f), 0x3eab);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3.38e38f), 0x7f7e);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3.40e38f), 0x7f80);  // Becomes infinity.
+
+  // Verify round-to-nearest-even behavior.
+  float val1 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c00)));
+  float val2 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c01)));
+  float val3 = static_cast<float>(bfloat16(__bfloat16_raw(0x3c02)));
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f * (val1 + val2)), 0x3c00);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.5f * (val2 + val3)), 0x3c02);
+
+  // Conversion from int.
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-1), 0xbf80);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0), 0x0000);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(1), 0x3f80);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(2), 0x4000);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(3), 0x4040);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(12), 0x4140);
+
+  // Conversion from bool.
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(false), 0x0000);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(true), 0x3f80);
+
+  // Conversion to bool
+  VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(3)), true);
+  VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(0.33333f)), true);
+  VERIFY_IS_EQUAL(bfloat16(-0.0), false);
+  VERIFY_IS_EQUAL(static_cast<bool>(bfloat16(0.0)), false);
+
+  // Explicit conversion to float.
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(__bfloat16_raw(0x0000))), 0.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(__bfloat16_raw(0x3f80))), 1.0f);
+
+  // Implicit conversion to float
+  VERIFY_IS_EQUAL(bfloat16(__bfloat16_raw(0x0000)), 0.0f);
+  VERIFY_IS_EQUAL(bfloat16(__bfloat16_raw(0x3f80)), 1.0f);
+
+  // Zero representations
+  VERIFY_IS_EQUAL(bfloat16(0.0f), bfloat16(0.0f));
+  VERIFY_IS_EQUAL(bfloat16(-0.0f), bfloat16(0.0f));
+  VERIFY_IS_EQUAL(bfloat16(-0.0f), bfloat16(-0.0f));
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000);
+
+  // Default is zero
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16()), 0.0f);
+
+  // Representable floats round trip via bfloat16
+  test_roundtrip<float>();
+  test_roundtrip<double>();
+  test_roundtrip<std::complex<float> >();
+  test_roundtrip<std::complex<double> >();
+
+  // Conversion
+  Array<float,1,100> a;
+  for (int i = 0; i < 100; i++) a(i) = i + 1.25;
+  Array<bfloat16,1,100> b = a.cast<bfloat16>();
+  Array<float,1,100> c = b.cast<float>();
+  for (int i = 0; i < 100; ++i) {
+    VERIFY_LE(numext::abs(c(i) - a(i)), a(i) / 128);
+  }
+
+  // Epsilon
+  VERIFY_LE(1.0f, static_cast<float>((std::numeric_limits<bfloat16>::epsilon)() + bfloat16(1.0f)));
+  VERIFY_IS_EQUAL(1.0f, static_cast<float>((std::numeric_limits<bfloat16>::epsilon)() / bfloat16(2.0f) + bfloat16(1.0f)));
+
+  // Negate
+  VERIFY_IS_EQUAL(static_cast<float>(-bfloat16(3.0f)), -3.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(-bfloat16(-4.5f)), 4.5f);
+
+
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY((numext::isnan)(static_cast<float>(bfloat16(0.0 / 0.0))));
+  VERIFY((numext::isinf)(static_cast<float>(bfloat16(1.0 / 0.0))));
+  VERIFY((numext::isinf)(static_cast<float>(bfloat16(-1.0 / 0.0))));
+
+  // Visual Studio errors out on divisions by 0
+  VERIFY((numext::isnan)(bfloat16(0.0 / 0.0)));
+  VERIFY((numext::isinf)(bfloat16(1.0 / 0.0)));
+  VERIFY((numext::isinf)(bfloat16(-1.0 / 0.0)));
+#endif
+
+  // NaNs and infinities.
+  VERIFY(!(numext::isinf)(static_cast<float>(bfloat16(3.38e38f))));  // Largest finite number.
+  VERIFY(!(numext::isnan)(static_cast<float>(bfloat16(0.0f))));
+  VERIFY((numext::isinf)(static_cast<float>(bfloat16(__bfloat16_raw(0xff80)))));
+  VERIFY((numext::isnan)(static_cast<float>(bfloat16(__bfloat16_raw(0xffc0)))));
+  VERIFY((numext::isinf)(static_cast<float>(bfloat16(__bfloat16_raw(0x7f80)))));
+  VERIFY((numext::isnan)(static_cast<float>(bfloat16(__bfloat16_raw(0x7fc0)))));
+
+  // Exactly same checks as above, just directly on the bfloat16 representation.
+  VERIFY(!(numext::isinf)(bfloat16(__bfloat16_raw(0x7bff))));
+  VERIFY(!(numext::isnan)(bfloat16(__bfloat16_raw(0x0000))));
+  VERIFY((numext::isinf)(bfloat16(__bfloat16_raw(0xff80))));
+  VERIFY((numext::isnan)(bfloat16(__bfloat16_raw(0xffc0))));
+  VERIFY((numext::isinf)(bfloat16(__bfloat16_raw(0x7f80))));
+  VERIFY((numext::isnan)(bfloat16(__bfloat16_raw(0x7fc0))));
+
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0);
+  VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0);
+}
+
+void test_numtraits()
+{
+  std::cout << "epsilon       = " << NumTraits<bfloat16>::epsilon() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::epsilon()) << ")" << std::endl;
+  std::cout << "highest       = " << NumTraits<bfloat16>::highest() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::highest()) << ")" << std::endl;
+  std::cout << "lowest        = " << NumTraits<bfloat16>::lowest() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::lowest()) << ")" << std::endl;
+  std::cout << "min           = " << (std::numeric_limits<bfloat16>::min)() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>((std::numeric_limits<bfloat16>::min)()) << ")" << std::endl;
+  std::cout << "denorm min    = " << (std::numeric_limits<bfloat16>::denorm_min)() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>((std::numeric_limits<bfloat16>::denorm_min)()) << ")" << std::endl;
+  std::cout << "infinity      = " << NumTraits<bfloat16>::infinity() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::infinity()) << ")" << std::endl;
+  std::cout << "quiet nan     = " << NumTraits<bfloat16>::quiet_NaN() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<bfloat16>::quiet_NaN()) << ")" << std::endl;
+  std::cout << "signaling nan = " << std::numeric_limits<bfloat16>::signaling_NaN() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::signaling_NaN()) << ")" << std::endl;
+
+  VERIFY(NumTraits<bfloat16>::IsSigned);
+
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::infinity()),
+    numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::infinity())) );
+  // There is no guarantee that casting a 32-bit NaN to bfloat16 has a precise
+  // bit pattern.  We test that it is in fact a NaN, then test the signaling
+  // bit (msb of significand is 1 for quiet, 0 for signaling).
+  const numext::uint16_t BFLOAT16_QUIET_BIT = 0x0040;
+  VERIFY(
+    (numext::isnan)(std::numeric_limits<bfloat16>::quiet_NaN())
+    && (numext::isnan)(bfloat16(std::numeric_limits<float>::quiet_NaN()))
+    && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::quiet_NaN()) & BFLOAT16_QUIET_BIT) > 0)
+    && ((numext::bit_cast<numext::uint16_t>(bfloat16(std::numeric_limits<float>::quiet_NaN())) & BFLOAT16_QUIET_BIT) > 0) );
+  // After a cast to bfloat16, a signaling NaN may become non-signaling. Thus,
+  // we check that both are NaN, and that only the `numeric_limits` version is
+  // signaling.
+  VERIFY(
+    (numext::isnan)(std::numeric_limits<bfloat16>::signaling_NaN())
+    && (numext::isnan)(bfloat16(std::numeric_limits<float>::signaling_NaN()))
+    && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<bfloat16>::signaling_NaN()) & BFLOAT16_QUIET_BIT) == 0) );
+
+  VERIFY( (std::numeric_limits<bfloat16>::min)() > bfloat16(0.f) );
+  VERIFY( (std::numeric_limits<bfloat16>::denorm_min)() > bfloat16(0.f) );
+  VERIFY_IS_EQUAL( (std::numeric_limits<bfloat16>::denorm_min)()/bfloat16(2), bfloat16(0.f) );
+}
+
+void test_arithmetic()
+{
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(2) + bfloat16(2)), 4);
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(2) + bfloat16(-2)), 0);
+  VERIFY_IS_APPROX(static_cast<float>(bfloat16(0.33333f) + bfloat16(0.66667f)), 1.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(bfloat16(2.0f) * bfloat16(-5.5f)), -11.0f);
+  VERIFY_IS_APPROX(static_cast<float>(bfloat16(1.0f) / bfloat16(3.0f)), 0.3339f);
+  VERIFY_IS_EQUAL(static_cast<float>(-bfloat16(4096.0f)), -4096.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(-bfloat16(-4096.0f)), 4096.0f);
+}
+
+void test_comparison()
+{
+  VERIFY(bfloat16(1.0f) > bfloat16(0.5f));
+  VERIFY(bfloat16(0.5f) < bfloat16(1.0f));
+  VERIFY(!(bfloat16(1.0f) < bfloat16(0.5f)));
+  VERIFY(!(bfloat16(0.5f) > bfloat16(1.0f)));
+
+  VERIFY(!(bfloat16(4.0f) > bfloat16(4.0f)));
+  VERIFY(!(bfloat16(4.0f) < bfloat16(4.0f)));
+
+  VERIFY(!(bfloat16(0.0f) < bfloat16(-0.0f)));
+  VERIFY(!(bfloat16(-0.0f) < bfloat16(0.0f)));
+  VERIFY(!(bfloat16(0.0f) > bfloat16(-0.0f)));
+  VERIFY(!(bfloat16(-0.0f) > bfloat16(0.0f)));
+
+  VERIFY(bfloat16(0.2f) > bfloat16(-1.0f));
+  VERIFY(bfloat16(-1.0f) < bfloat16(0.2f));
+  VERIFY(bfloat16(-16.0f) < bfloat16(-15.0f));
+
+  VERIFY(bfloat16(1.0f) == bfloat16(1.0f));
+  VERIFY(bfloat16(1.0f) != bfloat16(2.0f));
+
+  // Comparisons with NaNs and infinities.
+#if !EIGEN_COMP_MSVC
+  // Visual Studio errors out on divisions by 0
+  VERIFY(!(bfloat16(0.0 / 0.0) == bfloat16(0.0 / 0.0)));
+  VERIFY(bfloat16(0.0 / 0.0) != bfloat16(0.0 / 0.0));
+
+  VERIFY(!(bfloat16(1.0) == bfloat16(0.0 / 0.0)));
+  VERIFY(!(bfloat16(1.0) < bfloat16(0.0 / 0.0)));
+  VERIFY(!(bfloat16(1.0) > bfloat16(0.0 / 0.0)));
+  VERIFY(bfloat16(1.0) != bfloat16(0.0 / 0.0));
+
+  VERIFY(bfloat16(1.0) < bfloat16(1.0 / 0.0));
+  VERIFY(bfloat16(1.0) > bfloat16(-1.0 / 0.0));
+#endif
+}
+
+void test_basic_functions()
+{
+  VERIFY_IS_EQUAL(static_cast<float>(numext::abs(bfloat16(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(static_cast<float>(abs(bfloat16(3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(static_cast<float>(numext::abs(bfloat16(-3.5f))), 3.5f);
+  VERIFY_IS_EQUAL(static_cast<float>(abs(bfloat16(-3.5f))), 3.5f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::floor(bfloat16(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(floor(bfloat16(3.5f))), 3.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(numext::floor(bfloat16(-3.5f))), -4.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(floor(bfloat16(-3.5f))), -4.0f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::ceil(bfloat16(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(ceil(bfloat16(3.5f))), 4.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(numext::ceil(bfloat16(-3.5f))), -3.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(ceil(bfloat16(-3.5f))), -3.0f);
+
+  VERIFY_IS_APPROX(static_cast<float>(numext::sqrt(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(sqrt(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::sqrt(bfloat16(4.0f))), 2.0f);
+  VERIFY_IS_APPROX(static_cast<float>(sqrt(bfloat16(4.0f))), 2.0f);
+
+  VERIFY_IS_APPROX(static_cast<float>(numext::pow(bfloat16(0.0f), bfloat16(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(pow(bfloat16(0.0f), bfloat16(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::pow(bfloat16(2.0f), bfloat16(2.0f))), 4.0f);
+  VERIFY_IS_APPROX(static_cast<float>(pow(bfloat16(2.0f), bfloat16(2.0f))), 4.0f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::exp(bfloat16(0.0f))), 1.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(exp(bfloat16(0.0f))), 1.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::exp(bfloat16(EIGEN_PI))), 20.f + static_cast<float>(EIGEN_PI));
+  VERIFY_IS_APPROX(static_cast<float>(exp(bfloat16(EIGEN_PI))), 20.f + static_cast<float>(EIGEN_PI));
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::expm1(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(expm1(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::expm1(bfloat16(2.0f))), 6.375f);
+  VERIFY_IS_APPROX(static_cast<float>(expm1(bfloat16(2.0f))), 6.375f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::log(bfloat16(1.0f))), 0.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(log(bfloat16(1.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::log(bfloat16(10.0f))), 2.296875f);
+  VERIFY_IS_APPROX(static_cast<float>(log(bfloat16(10.0f))), 2.296875f);
+
+  VERIFY_IS_EQUAL(static_cast<float>(numext::log1p(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(static_cast<float>(log1p(bfloat16(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(static_cast<float>(numext::log1p(bfloat16(10.0f))), 2.390625f);
+  VERIFY_IS_APPROX(static_cast<float>(log1p(bfloat16(10.0f))), 2.390625f);
+}
+
+void test_trigonometric_functions()
+{
+  VERIFY_IS_APPROX(numext::cos(bfloat16(0.0f)), bfloat16(cosf(0.0f)));
+  VERIFY_IS_APPROX(cos(bfloat16(0.0f)), bfloat16(cosf(0.0f)));
+  VERIFY_IS_APPROX(numext::cos(bfloat16(EIGEN_PI)), bfloat16(cosf(EIGEN_PI)));
+  // VERIFY_IS_APPROX(numext::cos(bfloat16(EIGEN_PI/2)), bfloat16(cosf(EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::cos(bfloat16(3*EIGEN_PI/2)), bfloat16(cosf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::cos(bfloat16(3.5f)), bfloat16(cosf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::sin(bfloat16(0.0f)), bfloat16(sinf(0.0f)));
+  VERIFY_IS_APPROX(sin(bfloat16(0.0f)), bfloat16(sinf(0.0f)));
+  // VERIFY_IS_APPROX(numext::sin(bfloat16(EIGEN_PI)), bfloat16(sinf(EIGEN_PI)));
+  VERIFY_IS_APPROX(numext::sin(bfloat16(EIGEN_PI/2)), bfloat16(sinf(EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(bfloat16(3*EIGEN_PI/2)), bfloat16(sinf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::sin(bfloat16(3.5f)), bfloat16(sinf(3.5f)));
+
+  VERIFY_IS_APPROX(numext::tan(bfloat16(0.0f)), bfloat16(tanf(0.0f)));
+  VERIFY_IS_APPROX(tan(bfloat16(0.0f)), bfloat16(tanf(0.0f)));
+  // VERIFY_IS_APPROX(numext::tan(bfloat16(EIGEN_PI)), bfloat16(tanf(EIGEN_PI)));
+  // VERIFY_IS_APPROX(numext::tan(bfloat16(EIGEN_PI/2)), bfloat16(tanf(EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::tan(bfloat16(3*EIGEN_PI/2)), bfloat16(tanf(3*EIGEN_PI/2)));
+  VERIFY_IS_APPROX(numext::tan(bfloat16(3.5f)), bfloat16(tanf(3.5f)));
+}
+
+void test_array()
+{
+  typedef Array<bfloat16,1,Dynamic> ArrayXh;
+  Index size = internal::random<Index>(1,10);
+  Index i = internal::random<Index>(0,size-1);
+  ArrayXh a1 = ArrayXh::Random(size), a2 = ArrayXh::Random(size);
+  VERIFY_IS_APPROX( a1+a1, bfloat16(2)*a1 );
+  VERIFY( (a1.abs() >= bfloat16(0)).all() );
+  VERIFY_IS_APPROX( (a1*a1).sqrt(), a1.abs() );
+
+  VERIFY( ((a1.min)(a2) <= (a1.max)(a2)).all() );
+  a1(i) = bfloat16(-10.);
+  VERIFY_IS_EQUAL( a1.minCoeff(), bfloat16(-10.) );
+  a1(i) = bfloat16(10.);
+  VERIFY_IS_EQUAL( a1.maxCoeff(), bfloat16(10.) );
+
+  std::stringstream ss;
+  ss << a1;
+}
+
+void test_product()
+{
+  typedef Matrix<bfloat16,Dynamic,Dynamic> MatrixXh;
+  Index rows  = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  Index cols  = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  MatrixXh Ah = MatrixXh::Random(rows,depth);
+  MatrixXh Bh = MatrixXh::Random(depth,cols);
+  MatrixXh Ch = MatrixXh::Random(rows,cols);
+  MatrixXf Af = Ah.cast<float>();
+  MatrixXf Bf = Bh.cast<float>();
+  MatrixXf Cf = Ch.cast<float>();
+  VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast<bfloat16>());
+}
+
+EIGEN_DECLARE_TEST(bfloat16_float)
+{
+  CALL_SUBTEST(test_numtraits());
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST(test_conversion());
+    CALL_SUBTEST(test_arithmetic());
+    CALL_SUBTEST(test_comparison());
+    CALL_SUBTEST(test_basic_functions());
+    CALL_SUBTEST(test_trigonometric_functions());
+    CALL_SUBTEST(test_array());
+    CALL_SUBTEST(test_product());
+  }
+}
diff --git a/contrib/eigen/test/bicgstab.cpp b/contrib/eigen/test/bicgstab.cpp
index 4cc0dd31cf4d8616e533aed3f1b347f3439a59b6..59c4b501c20e6f51ec688c65c38ecfdf70a5a1ec 100644
--- a/contrib/eigen/test/bicgstab.cpp
+++ b/contrib/eigen/test/bicgstab.cpp
@@ -10,11 +10,11 @@
 #include "sparse_solver.h"
 #include <Eigen/IterativeLinearSolvers>
 
-template<typename T, typename I> void test_bicgstab_T()
+template<typename T, typename I_> void test_bicgstab_T()
 {
-  BiCGSTAB<SparseMatrix<T,0,I>, DiagonalPreconditioner<T> >     bicgstab_colmajor_diag;
-  BiCGSTAB<SparseMatrix<T,0,I>, IdentityPreconditioner    >     bicgstab_colmajor_I;
-  BiCGSTAB<SparseMatrix<T,0,I>, IncompleteLUT<T,I> >              bicgstab_colmajor_ilut;
+  BiCGSTAB<SparseMatrix<T,0,I_>, DiagonalPreconditioner<T> >     bicgstab_colmajor_diag;
+  BiCGSTAB<SparseMatrix<T,0,I_>, IdentityPreconditioner    >     bicgstab_colmajor_I;
+  BiCGSTAB<SparseMatrix<T,0,I_>, IncompleteLUT<T,I_> >              bicgstab_colmajor_ilut;
   //BiCGSTAB<SparseMatrix<T>, SSORPreconditioner<T> >     bicgstab_colmajor_ssor;
 
   bicgstab_colmajor_diag.setTolerance(NumTraits<T>::epsilon()*4);
@@ -26,7 +26,7 @@ template<typename T, typename I> void test_bicgstab_T()
   //CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_ssor)     );
 }
 
-void test_bicgstab()
+EIGEN_DECLARE_TEST(bicgstab)
 {
   CALL_SUBTEST_1((test_bicgstab_T<double,int>()) );
   CALL_SUBTEST_2((test_bicgstab_T<std::complex<double>, int>()));
diff --git a/contrib/eigen/test/blasutil.cpp b/contrib/eigen/test/blasutil.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..845a498d675f589a4a526834fd693f024392c07b
--- /dev/null
+++ b/contrib/eigen/test/blasutil.cpp
@@ -0,0 +1,210 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino <everton.constantino@ibm.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/
+
+#include "main.h"
+
+// Disable "ignoring attributes on template argument"
+// for packet_traits<Packet*>
+// => The only workaround would be to wrap _m128 and the likes
+//    within wrappers.
+#if EIGEN_GNUC_AT_LEAST(6,0)
+    #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
+#define GET(i,j) (StorageOrder == RowMajor ? (i)*stride + (j) : (i) + (j)*stride)
+#define SCATTER(i,j,k) (StorageOrder == RowMajor ? ((i)+(k))*stride + (j) : (i) + ((j)+(k))*stride)
+
+template<typename Scalar, typename Packet>
+void compare(const Packet& a, const Packet& b)
+{
+    int pktsz = internal::packet_traits<Scalar>::size;
+    Scalar *buffA = new Scalar[pktsz];
+    Scalar *buffB = new Scalar[pktsz];
+
+    internal::pstoreu<Scalar, Packet>(buffA, a);
+    internal::pstoreu<Scalar, Packet>(buffB, b);
+
+    for(int i = 0; i < pktsz; i++)
+    {
+        VERIFY_IS_EQUAL(buffA[i], buffB[i]);
+    }
+
+    delete[] buffA;
+    delete[] buffB;
+}
+
+template<typename Scalar, int StorageOrder, int n>
+struct PacketBlockSet
+{
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+
+    void setPacketBlock(internal::PacketBlock<Packet,n>& block, Scalar value)
+    {
+        for(int idx = 0; idx < n; idx++)
+        {
+            block.packet[idx] = internal::pset1<Packet>(value);
+        }
+    }
+
+    void comparePacketBlock(Scalar *data, int i, int j, int stride, internal::PacketBlock<Packet, n>& block)
+    {
+        for(int idx = 0; idx < n; idx++)
+        {
+            Packet line = internal::ploadu<Packet>(data + SCATTER(i,j,idx));
+            compare<Scalar, Packet>(block.packet[idx], line);
+        }
+    }
+};
+
+template<typename Scalar, int StorageOrder, int BlockSize>
+void run_bdmp_spec_1()
+{
+    typedef internal::blas_data_mapper<Scalar, int, StorageOrder> BlasDataMapper;
+    int packetSize = internal::packet_traits<Scalar>::size;
+    int minSize = std::max<int>(packetSize, BlockSize);
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+
+    int szm = internal::random<int>(minSize,500), szn = internal::random<int>(minSize,500);
+    int stride = StorageOrder == RowMajor ? szn : szm;
+    Scalar *d = new Scalar[szn*szm];
+
+    // Initializing with random entries
+    for(int i = 0; i < szm*szn; i++)
+    {
+        d[i] = internal::random<Scalar>(static_cast<Scalar>(3), static_cast<Scalar>(10));
+    }
+
+    BlasDataMapper bdm(d, stride);
+
+    // Testing operator()
+    for(int i = 0; i < szm; i++)
+    {
+        for(int j = 0; j < szn; j++)
+        {
+            VERIFY_IS_EQUAL(d[GET(i,j)], bdm(i,j));
+        }
+    }
+
+    // Testing getSubMapper and getLinearMapper
+    int i0 = internal::random<int>(0,szm-2);
+    int j0 = internal::random<int>(0,szn-2);
+    for(int i = i0; i < szm; i++)
+    {
+        for(int j = j0; j < szn; j++)
+        {
+            const BlasDataMapper& bdmSM = bdm.getSubMapper(i0,j0);
+            const internal::BlasLinearMapper<Scalar, int, 0>& bdmLM = bdm.getLinearMapper(i0,j0);
+
+            Scalar v = bdmSM(i - i0, j - j0);
+            Scalar vd = d[GET(i,j)];
+            VERIFY_IS_EQUAL(vd, v);
+            VERIFY_IS_EQUAL(vd, bdmLM(GET(i-i0, j-j0)));
+        }
+    }
+
+    // Testing loadPacket
+    for(int i = 0; i < szm - minSize; i++)
+    {
+        for(int j = 0; j < szn - minSize; j++)
+        {
+            Packet pktBDM = bdm.template loadPacket<Packet>(i,j);
+            Packet pktD = internal::ploadu<Packet>(d + GET(i,j));
+
+            compare<Scalar, Packet>(pktBDM, pktD);
+        }
+    }
+
+    // Testing gatherPacket
+    Scalar *buff = new Scalar[packetSize];
+    for(int i = 0; i < szm - minSize; i++)
+    {
+        for(int j = 0; j < szn - minSize; j++)
+        {
+            Packet p = bdm.template gatherPacket<Packet>(i,j);
+            internal::pstoreu<Scalar, Packet>(buff, p);
+
+            for(int k = 0; k < packetSize; k++)
+            {
+                VERIFY_IS_EQUAL(d[SCATTER(i,j,k)], buff[k]);
+            }
+
+        }
+    }
+    delete[] buff;
+
+    // Testing scatterPacket
+    for(int i = 0; i < szm - minSize; i++)
+    {
+        for(int j = 0; j < szn - minSize; j++)
+        {
+            Packet p = internal::pset1<Packet>(static_cast<Scalar>(1));
+            bdm.template scatterPacket<Packet>(i,j,p);
+            for(int k = 0; k < packetSize; k++)
+            {
+                VERIFY_IS_EQUAL(d[SCATTER(i,j,k)], static_cast<Scalar>(1));
+            }
+        }
+    }
+
+    //Testing storePacketBlock
+    internal::PacketBlock<Packet, BlockSize> block;
+
+    PacketBlockSet<Scalar, StorageOrder, BlockSize> pbs;
+    pbs.setPacketBlock(block, static_cast<Scalar>(2));
+
+    for(int i = 0; i < szm - minSize; i++)
+    {
+        for(int j = 0; j < szn - minSize; j++)
+        {
+            bdm.template storePacketBlock<Packet, BlockSize>(i, j, block);
+
+            pbs.comparePacketBlock(d, i, j, stride, block);
+        }
+    }
+
+    delete[] d;
+}
+
+template<typename Scalar>
+void run_test()
+{
+    run_bdmp_spec_1<Scalar, RowMajor, 1>();
+    run_bdmp_spec_1<Scalar, ColMajor, 1>();
+    run_bdmp_spec_1<Scalar, RowMajor, 2>();
+    run_bdmp_spec_1<Scalar, ColMajor, 2>();
+    run_bdmp_spec_1<Scalar, RowMajor, 4>();
+    run_bdmp_spec_1<Scalar, ColMajor, 4>();
+    run_bdmp_spec_1<Scalar, RowMajor, 8>();
+    run_bdmp_spec_1<Scalar, ColMajor, 8>();
+    run_bdmp_spec_1<Scalar, RowMajor, 16>();
+    run_bdmp_spec_1<Scalar, ColMajor, 16>();
+}
+
+EIGEN_DECLARE_TEST(blasutil)
+{
+    for(int i = 0; i < g_repeat; i++)
+    {
+        CALL_SUBTEST_1(run_test<numext::int8_t>());
+        CALL_SUBTEST_2(run_test<numext::int16_t>());
+        CALL_SUBTEST_3(run_test<numext::int32_t>());
+
+// TODO: Replace this by a call to numext::int64_t as soon as we have a way to
+// detect the typedef for int64_t on all platforms
+#if EIGEN_HAS_CXX11
+        CALL_SUBTEST_4(run_test<signed long long>());
+#else
+        CALL_SUBTEST_4(run_test<signed long>());
+#endif
+
+        CALL_SUBTEST_5(run_test<float_t>());
+        CALL_SUBTEST_6(run_test<double_t>());
+        CALL_SUBTEST_7(run_test<std::complex<float> >());
+        CALL_SUBTEST_8(run_test<std::complex<double> >());
+    }
+}
diff --git a/contrib/eigen/test/block.cpp b/contrib/eigen/test/block.cpp
index ca9c21fe3016a52d5c1b8911734c76eafdf44e95..84124aba642b38521d864a5b130ac3cab8b86684 100644
--- a/contrib/eigen/test/block.cpp
+++ b/contrib/eigen/test/block.cpp
@@ -29,6 +29,13 @@ block_real_only(const MatrixType &, Index, Index, Index, Index, const Scalar&) {
   return Scalar(0);
 }
 
+// Check at compile-time that T1==T2, and at runtime-time that a==b
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_block(const T1& a, const T2& b)
+{
+  return a.isApprox(b);
+}
 
 template<typename MatrixType> void block(const MatrixType& m)
 {
@@ -86,10 +93,9 @@ template<typename MatrixType> void block(const MatrixType& m)
   m1.block(r1,c1,r2-r1+1,c2-c1+1) = s1 * m2.block(0, 0, r2-r1+1,c2-c1+1);
   m1.block(r1,c1,r2-r1+1,c2-c1+1)(r2-r1,c2-c1) = m2.block(0, 0, r2-r1+1,c2-c1+1)(0,0);
 
-  enum {
-    BlockRows = 2,
-    BlockCols = 5
-  };
+  const Index BlockRows = 2;
+  const Index BlockCols = 5;
+
   if (rows>=5 && cols>=8)
   {
     // test fixed block() as lvalue
@@ -105,6 +111,11 @@ template<typename MatrixType> void block(const MatrixType& m)
     m1.template block<BlockRows,Dynamic>(1,1,BlockRows,BlockCols)(0,3) = m1.template block<2,5>(1,1)(1,2);
     Matrix<Scalar,Dynamic,Dynamic> b2 = m1.template block<Dynamic,BlockCols>(3,3,2,5);
     VERIFY_IS_EQUAL(b2, m1.block(3,3,BlockRows,BlockCols));
+
+    VERIFY(is_same_block(m1.block(3,3,BlockRows,BlockCols), m1.block(3,3,fix<Dynamic>(BlockRows),fix<Dynamic>(BlockCols))));
+    VERIFY(is_same_block(m1.template block<BlockRows,Dynamic>(1,1,BlockRows,BlockCols), m1.block(1,1,fix<BlockRows>,BlockCols)));
+    VERIFY(is_same_block(m1.template block<BlockRows,BlockCols>(1,1,BlockRows,BlockCols), m1.block(1,1,fix<BlockRows>(),fix<BlockCols>)));
+    VERIFY(is_same_block(m1.template block<BlockRows,BlockCols>(1,1,BlockRows,BlockCols), m1.block(1,1,fix<BlockRows>,fix<BlockCols>(BlockCols))));
   }
 
   if (rows>2)
@@ -150,9 +161,18 @@ template<typename MatrixType> void block(const MatrixType& m)
   // expressions without direct access
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
   VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_APPROX( ((m1+m2).template block<Dynamic,1>(r1,c1,r2-r1+1,1)) , ((m1+m2).eval().col(c1).eval().segment(r1,r2-r1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).template block<1,Dynamic>(r1,c1,1,c2-c1+1)) , ((m1+m2).eval().row(r1).eval().segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).transpose().template block<1,Dynamic>(c1,r1,1,r2-r1+1)) , ((m1+m2).eval().col(c1).eval().segment(r1,r2-r1+1)).transpose() );
+  VERIFY_IS_APPROX( (m1+m2).row(r1).eval(), (m1+m2).eval().row(r1) );
+  VERIFY_IS_APPROX( (m1+m2).adjoint().col(r1).eval(), (m1+m2).adjoint().eval().col(r1) );
+  VERIFY_IS_APPROX( (m1+m2).adjoint().row(c1).eval(), (m1+m2).adjoint().eval().row(c1) );
+  VERIFY_IS_APPROX( (m1*1).row(r1).segment(c1,c2-c1+1).eval(), m1.row(r1).eval().segment(c1,c2-c1+1).eval() );
+  VERIFY_IS_APPROX( m1.col(c1).reverse().segment(r1,r2-r1+1).eval(),m1.col(c1).reverse().eval().segment(r1,r2-r1+1).eval() );
 
   VERIFY_IS_APPROX( (m1*1).topRows(r1),  m1.topRows(r1) );
   VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) );
@@ -200,6 +220,23 @@ template<typename MatrixType> void block(const MatrixType& m)
     VERIFY_RAISES_ASSERT( m1.array() *= m1.col(0).array() );
     VERIFY_RAISES_ASSERT( m1.array() /= m1.col(0).array() );
   }
+
+  VERIFY_IS_EQUAL( m1.template subVector<Horizontal>(r1), m1.row(r1) );
+  VERIFY_IS_APPROX( (m1+m1).template subVector<Horizontal>(r1), (m1+m1).row(r1) );
+  VERIFY_IS_EQUAL( m1.template subVector<Vertical>(c1), m1.col(c1) );
+  VERIFY_IS_APPROX( (m1+m1).template subVector<Vertical>(c1), (m1+m1).col(c1) );
+  VERIFY_IS_EQUAL( m1.template subVectors<Horizontal>(), m1.rows() );
+  VERIFY_IS_EQUAL( m1.template subVectors<Vertical>(), m1.cols() );
+
+  if (rows>=2 || cols>=2) {
+    VERIFY_IS_EQUAL( int(m1.middleCols(0,0).IsRowMajor), int(m1.IsRowMajor) );
+    VERIFY_IS_EQUAL( m1.middleCols(0,0).outerSize(), m1.IsRowMajor ? rows : 0);
+    VERIFY_IS_EQUAL( m1.middleCols(0,0).innerSize(), m1.IsRowMajor ? 0 : rows);
+
+    VERIFY_IS_EQUAL( int(m1.middleRows(0,0).IsRowMajor), int(m1.IsRowMajor) );
+    VERIFY_IS_EQUAL( m1.middleRows(0,0).outerSize(), m1.IsRowMajor ? 0 : cols);
+    VERIFY_IS_EQUAL( m1.middleRows(0,0).innerSize(), m1.IsRowMajor ? cols : 0);
+  }
 }
 
 
@@ -256,15 +293,18 @@ void data_and_stride(const MatrixType& m)
   compare_using_data_and_stride(m1.col(c1).transpose());
 }
 
-void test_block()
+EIGEN_DECLARE_TEST(block)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( block(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_1( block(Matrix<float, 1, Dynamic>(internal::random(2,50))) );
+    CALL_SUBTEST_1( block(Matrix<float, Dynamic, 1>(internal::random(2,50))) );
     CALL_SUBTEST_2( block(Matrix4d()) );
-    CALL_SUBTEST_3( block(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_4( block(MatrixXi(8, 12)) );
-    CALL_SUBTEST_5( block(MatrixXcd(20, 20)) );
-    CALL_SUBTEST_6( block(MatrixXf(20, 20)) );
+    CALL_SUBTEST_3( block(MatrixXcf(internal::random(2,50), internal::random(2,50))) );
+    CALL_SUBTEST_4( block(MatrixXi(internal::random(2,50), internal::random(2,50))) );
+    CALL_SUBTEST_5( block(MatrixXcd(internal::random(2,50), internal::random(2,50))) );
+    CALL_SUBTEST_6( block(MatrixXf(internal::random(2,50), internal::random(2,50))) );
+    CALL_SUBTEST_7( block(Matrix<int,Dynamic,Dynamic,RowMajor>(internal::random(2,50), internal::random(2,50))) );
 
     CALL_SUBTEST_8( block(Matrix<float,Dynamic,4>(3, 4)) );
 
diff --git a/contrib/eigen/test/boostmultiprec.cpp b/contrib/eigen/test/boostmultiprec.cpp
index e06e9bdaf6d287abb31f3e3a35f9b9a3fa1da081..7c79ded23bc49639ded9cda751c10bdae0fe0ee0 100644
--- a/contrib/eigen/test/boostmultiprec.cpp
+++ b/contrib/eigen/test/boostmultiprec.cpp
@@ -55,6 +55,10 @@
 #include "bdcsvd.cpp"
 #endif
 
+#ifdef EIGEN_TEST_PART_11
+#include "simplicial_cholesky.cpp"
+#endif
+
 #include <Eigen/Dense>
 
 #undef min
@@ -62,7 +66,9 @@
 #undef isnan
 #undef isinf
 #undef isfinite
+#undef I
 
+#include <boost/serialization/nvp.hpp>
 #include <boost/multiprecision/cpp_dec_float.hpp>
 #include <boost/multiprecision/number.hpp>
 #include <boost/math/special_functions.hpp>
@@ -141,7 +147,7 @@ namespace Eigen {
 
 }
 
-void test_boostmultiprec()
+EIGEN_DECLARE_TEST(boostmultiprec)
 {
   typedef Matrix<Real,Dynamic,Dynamic> Mat;
   typedef Matrix<std::complex<Real>,Dynamic,Dynamic> MatC;
@@ -152,7 +158,7 @@ void test_boostmultiprec()
   std::cout << "NumTraits<Real>::highest()         = " << NumTraits<Real>::highest() << std::endl;
   std::cout << "NumTraits<Real>::digits10()        = " << NumTraits<Real>::digits10() << std::endl;
 
-  // chekc stream output
+  // check stream output
   {
     Mat A(10,10);
     A.setRandom();
@@ -197,5 +203,6 @@ void test_boostmultiprec()
 
   CALL_SUBTEST_9(( jacobisvd(Mat(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
   CALL_SUBTEST_10(( bdcsvd(Mat(internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random<int>(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
-}
 
+  CALL_SUBTEST_11(( test_simplicial_cholesky_T<Real,int,ColMajor>() ));
+}
diff --git a/contrib/eigen/test/cholesky.cpp b/contrib/eigen/test/cholesky.cpp
index 5cf842d6d2bd83c150b154dc17a8ab6b70d7a695..0b1a7b45b2c2b8d93ad03030fa195e3ac97fb6b8 100644
--- a/contrib/eigen/test/cholesky.cpp
+++ b/contrib/eigen/test/cholesky.cpp
@@ -7,15 +7,12 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_NO_ASSERTION_CHECKING
-#define EIGEN_NO_ASSERTION_CHECKING
-#endif
-
 #define TEST_ENABLE_TEMPORARY_TRACKING
 
 #include "main.h"
 #include <Eigen/Cholesky>
 #include <Eigen/QR>
+#include "solverbase.h"
 
 template<typename MatrixType, int UpLo>
 typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
@@ -81,15 +78,17 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
   }
 
   {
+    STATIC_CHECK(( internal::is_same<typename LLT<MatrixType,Lower>::StorageIndex,int>::value ));
+    STATIC_CHECK(( internal::is_same<typename LLT<MatrixType,Upper>::StorageIndex,int>::value ));
+
     SquareMatrixType symmUp = symm.template triangularView<Upper>();
     SquareMatrixType symmLo = symm.template triangularView<Lower>();
 
     LLT<SquareMatrixType,Lower> chollo(symmLo);
     VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix());
-    vecX = chollo.solve(vecB);
-    VERIFY_IS_APPROX(symm * vecX, vecB);
-    matX = chollo.solve(matB);
-    VERIFY_IS_APPROX(symm * matX, matB);
+
+    check_solverbase<VectorType, VectorType>(symm, chollo, rows, rows, 1);
+    check_solverbase<MatrixType, MatrixType>(symm, chollo, rows, cols, rows);
 
     const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols));
     RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
@@ -143,6 +142,9 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
 
   // LDLT
   {
+    STATIC_CHECK(( internal::is_same<typename LDLT<MatrixType,Lower>::StorageIndex,int>::value ));
+    STATIC_CHECK(( internal::is_same<typename LDLT<MatrixType,Upper>::StorageIndex,int>::value ));
+
     int sign = internal::random<int>()%2 ? 1 : -1;
 
     if(sign == -1)
@@ -156,10 +158,9 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
     LDLT<SquareMatrixType,Lower> ldltlo(symmLo);
     VERIFY(ldltlo.info()==Success);
     VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix());
-    vecX = ldltlo.solve(vecB);
-    VERIFY_IS_APPROX(symm * vecX, vecB);
-    matX = ldltlo.solve(matB);
-    VERIFY_IS_APPROX(symm * matX, matB);
+
+    check_solverbase<VectorType, VectorType>(symm, ldltlo, rows, rows, 1);
+    check_solverbase<MatrixType, MatrixType>(symm, ldltlo, rows, cols, rows);
 
     const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols));
     RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
@@ -313,10 +314,9 @@ template<typename MatrixType> void cholesky_cplx(const MatrixType& m)
 
     LLT<RealMatrixType,Lower> chollo(symmLo);
     VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix());
-    vecX = chollo.solve(vecB);
-    VERIFY_IS_APPROX(symm * vecX, vecB);
-//     matX = chollo.solve(matB);
-//     VERIFY_IS_APPROX(symm * matX, matB);
+
+    check_solverbase<VectorType, VectorType>(symm, chollo, rows, rows, 1);
+    //check_solverbase<MatrixType, MatrixType>(symm, chollo, rows, cols, rows);
   }
 
   // LDLT
@@ -333,10 +333,9 @@ template<typename MatrixType> void cholesky_cplx(const MatrixType& m)
     LDLT<RealMatrixType,Lower> ldltlo(symmLo);
     VERIFY(ldltlo.info()==Success);
     VERIFY_IS_APPROX(symm, ldltlo.reconstructedMatrix());
-    vecX = ldltlo.solve(vecB);
-    VERIFY_IS_APPROX(symm * vecX, vecB);
-//     matX = ldltlo.solve(matB);
-//     VERIFY_IS_APPROX(symm * matX, matB);
+
+    check_solverbase<VectorType, VectorType>(symm, ldltlo, rows, rows, 1);
+    //check_solverbase<MatrixType, MatrixType>(symm, ldltlo, rows, cols, rows);
   }
 }
 
@@ -477,19 +476,23 @@ template<typename MatrixType> void cholesky_verify_assert()
   VERIFY_RAISES_ASSERT(llt.matrixL())
   VERIFY_RAISES_ASSERT(llt.matrixU())
   VERIFY_RAISES_ASSERT(llt.solve(tmp))
-  VERIFY_RAISES_ASSERT(llt.solveInPlace(&tmp))
+  VERIFY_RAISES_ASSERT(llt.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(llt.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(llt.solveInPlace(tmp))
 
   LDLT<MatrixType> ldlt;
   VERIFY_RAISES_ASSERT(ldlt.matrixL())
-  VERIFY_RAISES_ASSERT(ldlt.permutationP())
+  VERIFY_RAISES_ASSERT(ldlt.transpositionsP())
   VERIFY_RAISES_ASSERT(ldlt.vectorD())
   VERIFY_RAISES_ASSERT(ldlt.isPositive())
   VERIFY_RAISES_ASSERT(ldlt.isNegative())
   VERIFY_RAISES_ASSERT(ldlt.solve(tmp))
-  VERIFY_RAISES_ASSERT(ldlt.solveInPlace(&tmp))
+  VERIFY_RAISES_ASSERT(ldlt.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(ldlt.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(ldlt.solveInPlace(tmp))
 }
 
-void test_cholesky()
+EIGEN_DECLARE_TEST(cholesky)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
diff --git a/contrib/eigen/test/cholmod_support.cpp b/contrib/eigen/test/cholmod_support.cpp
index a7eda28f79e1bc161f4b482cdd8faeb8299dfae9..89b9cf41e0fed1f45268056058b2dbd0a5ab3ef1 100644
--- a/contrib/eigen/test/cholmod_support.cpp
+++ b/contrib/eigen/test/cholmod_support.cpp
@@ -12,21 +12,21 @@
 
 #include <Eigen/CholmodSupport>
 
-template<typename T> void test_cholmod_T()
+template<typename SparseType> void test_cholmod_ST()
 {
-  CholmodDecomposition<SparseMatrix<T>, Lower> g_chol_colmajor_lower; g_chol_colmajor_lower.setMode(CholmodSupernodalLLt);
-  CholmodDecomposition<SparseMatrix<T>, Upper> g_chol_colmajor_upper; g_chol_colmajor_upper.setMode(CholmodSupernodalLLt);
-  CholmodDecomposition<SparseMatrix<T>, Lower> g_llt_colmajor_lower;  g_llt_colmajor_lower.setMode(CholmodSimplicialLLt);
-  CholmodDecomposition<SparseMatrix<T>, Upper> g_llt_colmajor_upper;  g_llt_colmajor_upper.setMode(CholmodSimplicialLLt);
-  CholmodDecomposition<SparseMatrix<T>, Lower> g_ldlt_colmajor_lower; g_ldlt_colmajor_lower.setMode(CholmodLDLt);
-  CholmodDecomposition<SparseMatrix<T>, Upper> g_ldlt_colmajor_upper; g_ldlt_colmajor_upper.setMode(CholmodLDLt);
+  CholmodDecomposition<SparseType, Lower> g_chol_colmajor_lower; g_chol_colmajor_lower.setMode(CholmodSupernodalLLt);
+  CholmodDecomposition<SparseType, Upper> g_chol_colmajor_upper; g_chol_colmajor_upper.setMode(CholmodSupernodalLLt);
+  CholmodDecomposition<SparseType, Lower> g_llt_colmajor_lower;  g_llt_colmajor_lower.setMode(CholmodSimplicialLLt);
+  CholmodDecomposition<SparseType, Upper> g_llt_colmajor_upper;  g_llt_colmajor_upper.setMode(CholmodSimplicialLLt);
+  CholmodDecomposition<SparseType, Lower> g_ldlt_colmajor_lower; g_ldlt_colmajor_lower.setMode(CholmodLDLt);
+  CholmodDecomposition<SparseType, Upper> g_ldlt_colmajor_upper; g_ldlt_colmajor_upper.setMode(CholmodLDLt);
   
-  CholmodSupernodalLLT<SparseMatrix<T>, Lower> chol_colmajor_lower;
-  CholmodSupernodalLLT<SparseMatrix<T>, Upper> chol_colmajor_upper;
-  CholmodSimplicialLLT<SparseMatrix<T>, Lower> llt_colmajor_lower;
-  CholmodSimplicialLLT<SparseMatrix<T>, Upper> llt_colmajor_upper;
-  CholmodSimplicialLDLT<SparseMatrix<T>, Lower> ldlt_colmajor_lower;
-  CholmodSimplicialLDLT<SparseMatrix<T>, Upper> ldlt_colmajor_upper;
+  CholmodSupernodalLLT<SparseType, Lower> chol_colmajor_lower;
+  CholmodSupernodalLLT<SparseType, Upper> chol_colmajor_upper;
+  CholmodSimplicialLLT<SparseType, Lower> llt_colmajor_lower;
+  CholmodSimplicialLLT<SparseType, Upper> llt_colmajor_upper;
+  CholmodSimplicialLDLT<SparseType, Lower> ldlt_colmajor_lower;
+  CholmodSimplicialLDLT<SparseType, Upper> ldlt_colmajor_upper;
 
   check_sparse_spd_solving(g_chol_colmajor_lower);
   check_sparse_spd_solving(g_chol_colmajor_upper);
@@ -50,8 +50,20 @@ template<typename T> void test_cholmod_T()
   check_sparse_spd_determinant(ldlt_colmajor_upper);
 }
 
-void test_cholmod_support()
+template<typename T, int flags, typename IdxType> void test_cholmod_T()
 {
-  CALL_SUBTEST_1(test_cholmod_T<double>());
-  CALL_SUBTEST_2(test_cholmod_T<std::complex<double> >());
+    test_cholmod_ST<SparseMatrix<T, flags, IdxType> >();
+}
+
+EIGEN_DECLARE_TEST(cholmod_support)
+{
+  CALL_SUBTEST_11( (test_cholmod_T<double              , ColMajor, int >()) );
+  CALL_SUBTEST_12( (test_cholmod_T<double              , ColMajor, long>()) );
+  CALL_SUBTEST_13( (test_cholmod_T<double              , RowMajor, int >()) );
+  CALL_SUBTEST_14( (test_cholmod_T<double              , RowMajor, long>()) );
+  CALL_SUBTEST_21( (test_cholmod_T<std::complex<double>, ColMajor, int >()) );
+  CALL_SUBTEST_22( (test_cholmod_T<std::complex<double>, ColMajor, long>()) );
+  // TODO complex row-major matrices do not work at the moment:
+  // CALL_SUBTEST_23( (test_cholmod_T<std::complex<double>, RowMajor, int >()) );
+  // CALL_SUBTEST_24( (test_cholmod_T<std::complex<double>, RowMajor, long>()) );
 }
diff --git a/contrib/eigen/test/commainitializer.cpp b/contrib/eigen/test/commainitializer.cpp
index 9844adbd2286435697cd74c5035d87aadd3e4a05..eb275be9cf4527e8fac96cb1f25b2c929695d800 100644
--- a/contrib/eigen/test/commainitializer.cpp
+++ b/contrib/eigen/test/commainitializer.cpp
@@ -34,8 +34,14 @@ void test_blocks()
 
   if(N1 > 0)
   {
-    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat11, mat21, mat22));
-    VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat21, mat21, mat22));
+    if(M1 > 0)
+    {
+      VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat11, mat21, mat22));
+    }
+    if(M2 > 0)
+    {
+      VERIFY_RAISES_ASSERT((m_fixed << mat11, mat12, mat21, mat21, mat22));
+    }
   }
   else
   {
@@ -49,24 +55,25 @@ void test_blocks()
 }
 
 
-template<int N>
+template<int depth, int N=0>
 struct test_block_recursion
 {
   static void run()
   {
-    test_blocks<(N>>6)&3, (N>>4)&3, (N>>2)&3, N & 3>();
-    test_block_recursion<N-1>::run();
+    test_block_recursion<depth-1, N>::run();
+    test_block_recursion<depth-1, N + (1 << (depth-1))>::run();
   }
 };
 
-template<>
-struct test_block_recursion<-1>
+template<int N>
+struct test_block_recursion<0,N>
 {
-  static void run() { }
+  static void run() {
+    test_blocks<(N>>6)&3, (N>>4)&3, (N>>2)&3, N & 3>();
+  }
 };
 
-void test_commainitializer()
-{
+void test_basics() {
   Matrix3d m3;
   Matrix4d m4;
 
@@ -99,8 +106,13 @@ void test_commainitializer()
         4, 5, 6,
         vec[2].transpose();
   VERIFY_IS_APPROX(m3, ref);
+}
+
+EIGEN_DECLARE_TEST(commainitializer)
+{
 
+  CALL_SUBTEST_1(test_basics());
 
   // recursively test all block-sizes from 0 to 3:
-  test_block_recursion<(1<<8) - 1>();
+  CALL_SUBTEST_2(test_block_recursion<8>::run());
 }
diff --git a/contrib/eigen/test/conjugate_gradient.cpp b/contrib/eigen/test/conjugate_gradient.cpp
index 9622fd86ddd67dbebba8bba342071176fca49376..b076a126b7cd8352eeb3199d8bb07640dc82a3f6 100644
--- a/contrib/eigen/test/conjugate_gradient.cpp
+++ b/contrib/eigen/test/conjugate_gradient.cpp
@@ -10,9 +10,9 @@
 #include "sparse_solver.h"
 #include <Eigen/IterativeLinearSolvers>
 
-template<typename T, typename I> void test_conjugate_gradient_T()
+template<typename T, typename I_> void test_conjugate_gradient_T()
 {
-  typedef SparseMatrix<T,0,I> SparseMatrixType;
+  typedef SparseMatrix<T,0,I_> SparseMatrixType;
   ConjugateGradient<SparseMatrixType, Lower      > cg_colmajor_lower_diag;
   ConjugateGradient<SparseMatrixType, Upper      > cg_colmajor_upper_diag;
   ConjugateGradient<SparseMatrixType, Lower|Upper> cg_colmajor_loup_diag;
@@ -26,7 +26,7 @@ template<typename T, typename I> void test_conjugate_gradient_T()
   CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I)     );
 }
 
-void test_conjugate_gradient()
+EIGEN_DECLARE_TEST(conjugate_gradient)
 {
   CALL_SUBTEST_1(( test_conjugate_gradient_T<double,int>() ));
   CALL_SUBTEST_2(( test_conjugate_gradient_T<std::complex<double>, int>() ));
diff --git a/contrib/eigen/test/conservative_resize.cpp b/contrib/eigen/test/conservative_resize.cpp
index 21a1db4acb90c3639462e244bb126fe7272a97b3..d48eb126fdb1fca779981e258f0c82430c1a01d1 100644
--- a/contrib/eigen/test/conservative_resize.cpp
+++ b/contrib/eigen/test/conservative_resize.cpp
@@ -10,6 +10,7 @@
 #include "main.h"
 
 #include <Eigen/Core>
+#include "AnnoyingScalar.h"
 
 using namespace Eigen;
 
@@ -109,7 +110,33 @@ void run_vector_tests()
   }
 }
 
-void test_conservative_resize()
+// Basic memory leak check with a non-copyable scalar type
+template<int> void noncopyable()
+{
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,1> VectorType;
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,Dynamic> MatrixType;
+
+  {
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    AnnoyingScalar::dont_throw = true;
+#endif
+    int n = 50;
+    VectorType v0(n), v1(n);
+    MatrixType m0(n,n), m1(n,n), m2(n,n);
+    v0.setOnes(); v1.setOnes();
+    m0.setOnes(); m1.setOnes(); m2.setOnes();
+    VERIFY(m0==m1);
+    m0.conservativeResize(2*n,2*n);
+    VERIFY(m0.topLeftCorner(n,n) == m1);
+    
+    VERIFY(v0.head(n) == v1);
+    v0.conservativeResize(2*n);
+    VERIFY(v0.head(n) == v1);
+  }
+  VERIFY(AnnoyingScalar::instances==0 && "global memory leak detected in noncopyable");
+}
+
+EIGEN_DECLARE_TEST(conservative_resize)
 {
   for(int i=0; i<g_repeat; ++i)
   {
@@ -122,12 +149,19 @@ void test_conservative_resize()
     CALL_SUBTEST_4((run_matrix_tests<std::complex<float>, Eigen::RowMajor>()));
     CALL_SUBTEST_4((run_matrix_tests<std::complex<float>, Eigen::ColMajor>()));
     CALL_SUBTEST_5((run_matrix_tests<std::complex<double>, Eigen::RowMajor>()));
-    CALL_SUBTEST_6((run_matrix_tests<std::complex<double>, Eigen::ColMajor>()));
+    CALL_SUBTEST_5((run_matrix_tests<std::complex<double>, Eigen::ColMajor>()));
+    CALL_SUBTEST_1((run_matrix_tests<int, Eigen::RowMajor | Eigen::DontAlign>()));
 
     CALL_SUBTEST_1((run_vector_tests<int>()));
     CALL_SUBTEST_2((run_vector_tests<float>()));
     CALL_SUBTEST_3((run_vector_tests<double>()));
     CALL_SUBTEST_4((run_vector_tests<std::complex<float> >()));
     CALL_SUBTEST_5((run_vector_tests<std::complex<double> >()));
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    AnnoyingScalar::dont_throw = true;
+#endif
+    CALL_SUBTEST_6(( run_vector_tests<AnnoyingScalar>() ));
+    CALL_SUBTEST_6(( noncopyable<0>() ));
   }
 }
diff --git a/contrib/eigen/test/constructor.cpp b/contrib/eigen/test/constructor.cpp
index 9885399510adc5abcb338b931c8a561817550186..ffd5e802abfb04a7fb28a8316ab065a9fa6ee9b5 100644
--- a/contrib/eigen/test/constructor.cpp
+++ b/contrib/eigen/test/constructor.cpp
@@ -39,7 +39,7 @@ template<typename MatrixType> void ctor_init1(const MatrixType& m)
 }
 
 
-void test_constructor()
+EIGEN_DECLARE_TEST(constructor)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( ctor_init1(Matrix<float, 1, 1>()) );
diff --git a/contrib/eigen/test/corners.cpp b/contrib/eigen/test/corners.cpp
index 32edadb25ad3bf6f66045f020574dbd58a787d9e..73342a8ddb692d0ce0a60ffa1d1f547881896761 100644
--- a/contrib/eigen/test/corners.cpp
+++ b/contrib/eigen/test/corners.cpp
@@ -101,7 +101,7 @@ template<typename MatrixType, int CRows, int CCols, int SRows, int SCols> void c
   VERIFY_IS_EQUAL((const_matrix.template rightCols<c>()), (const_matrix.template block<rows,c>(0,cols-c)));
 }
 
-void test_corners()
+EIGEN_DECLARE_TEST(corners)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( corners(Matrix<float, 1, 1>()) );
diff --git a/contrib/eigen/test/ctorleak.cpp b/contrib/eigen/test/ctorleak.cpp
index c158f5e4ee47c547dc383a23db54d54ccda40a2b..73904176bf0d9f1b37e0324eb6ba611cfb68b851 100644
--- a/contrib/eigen/test/ctorleak.cpp
+++ b/contrib/eigen/test/ctorleak.cpp
@@ -8,7 +8,7 @@ struct Foo
   static Index object_limit;
   int dummy;
 
-  Foo()
+  Foo() : dummy(0)
   {
 #ifdef EIGEN_EXCEPTIONS
     // TODO: Is this the correct way to handle this?
@@ -33,26 +33,37 @@ Index Foo::object_limit = 0;
 #undef EIGEN_TEST_MAX_SIZE
 #define EIGEN_TEST_MAX_SIZE 3
 
-void test_ctorleak()
+EIGEN_DECLARE_TEST(ctorleak)
 {
   typedef Matrix<Foo, Dynamic, Dynamic> MatrixX;
   typedef Matrix<Foo, Dynamic, 1> VectorX;
+  
   Foo::object_count = 0;
   for(int i = 0; i < g_repeat; i++) {
     Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
-    Foo::object_limit = internal::random<Index>(0, rows*cols - 2);
+    Foo::object_limit = rows*cols;
+    {
+    MatrixX r(rows, cols);
+    Foo::object_limit = r.size()+internal::random<Index>(0, rows*cols - 2);
     std::cout << "object_limit =" << Foo::object_limit << std::endl;
 #ifdef EIGEN_EXCEPTIONS
     try
     {
 #endif
-    	std::cout <<       "\nMatrixX m(" << rows << ", " << cols << ");\n";
-      MatrixX m(rows, cols);
+      if(internal::random<bool>()) {
+        std::cout <<       "\nMatrixX m(" << rows << ", " << cols << ");\n";
+        MatrixX m(rows, cols);
+      }
+      else {
+        std::cout <<       "\nMatrixX m(r);\n";
+        MatrixX m(r);
+      }
 #ifdef EIGEN_EXCEPTIONS
       VERIFY(false);  // not reached if exceptions are enabled
     }
     catch (const Foo::Fail&) { /* ignore */ }
 #endif
+    }
     VERIFY_IS_EQUAL(Index(0), Foo::object_count);
 
     {
@@ -66,4 +77,5 @@ void test_ctorleak()
     }
     VERIFY_IS_EQUAL(Index(0), Foo::object_count);
   }
+  std::cout << "\n";
 }
diff --git a/contrib/eigen/test/cuda_basic.cu b/contrib/eigen/test/cuda_basic.cu
deleted file mode 100644
index ce66c2c7868b3fe6e27135bea79b083380f97bfc..0000000000000000000000000000000000000000
--- a/contrib/eigen/test/cuda_basic.cu
+++ /dev/null
@@ -1,170 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// workaround issue between gcc >= 4.7 and cuda 5.5
-#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
-  #undef _GLIBCXX_ATOMIC_BUILTINS
-  #undef _GLIBCXX_USE_INT128
-#endif
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cuda_basic
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-
-#include <math_constants.h>
-#include <cuda.h>
-#include "main.h"
-#include "cuda_common.h"
-
-// Check that dense modules can be properly parsed by nvcc
-#include <Eigen/Dense>
-
-// struct Foo{
-//   EIGEN_DEVICE_FUNC
-//   void operator()(int i, const float* mats, float* vecs) const {
-//     using namespace Eigen;
-//   //   Matrix3f M(data);
-//   //   Vector3f x(data+9);
-//   //   Map<Vector3f>(data+9) = M.inverse() * x;
-//     Matrix3f M(mats+i/16);
-//     Vector3f x(vecs+i*3);
-//   //   using std::min;
-//   //   using std::sqrt;
-//     Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() *  x) / x.x();
-//     //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
-//   }
-// };
-
-template<typename T>
-struct coeff_wise {
-  EIGEN_DEVICE_FUNC
-  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
-  {
-    using namespace Eigen;
-    T x1(in+i);
-    T x2(in+i+1);
-    T x3(in+i+2);
-    Map<T> res(out+i*T::MaxSizeAtCompileTime);
-    
-    res.array() += (in[0] * x1 + x2).array() * x3.array();
-  }
-};
-
-template<typename T>
-struct replicate {
-  EIGEN_DEVICE_FUNC
-  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
-  {
-    using namespace Eigen;
-    T x1(in+i);
-    int step   = x1.size() * 4;
-    int stride = 3 * step;
-    
-    typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
-    MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
-    MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
-    MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
-  }
-};
-
-template<typename T>
-struct redux {
-  EIGEN_DEVICE_FUNC
-  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
-  {
-    using namespace Eigen;
-    int N = 10;
-    T x1(in+i);
-    out[i*N+0] = x1.minCoeff();
-    out[i*N+1] = x1.maxCoeff();
-    out[i*N+2] = x1.sum();
-    out[i*N+3] = x1.prod();
-    out[i*N+4] = x1.matrix().squaredNorm();
-    out[i*N+5] = x1.matrix().norm();
-    out[i*N+6] = x1.colwise().sum().maxCoeff();
-    out[i*N+7] = x1.rowwise().maxCoeff().sum();
-    out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
-  }
-};
-
-template<typename T1, typename T2>
-struct prod_test {
-  EIGEN_DEVICE_FUNC
-  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
-  {
-    using namespace Eigen;
-    typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
-    T1 x1(in+i);
-    T2 x2(in+i+1);
-    Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
-    res += in[i] * x1 * x2;
-  }
-};
-
-template<typename T1, typename T2>
-struct diagonal {
-  EIGEN_DEVICE_FUNC
-  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
-  {
-    using namespace Eigen;
-    T1 x1(in+i);
-    Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
-    res += x1.diagonal();
-  }
-};
-
-template<typename T>
-struct eigenvalues {
-  EIGEN_DEVICE_FUNC
-  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
-  {
-    using namespace Eigen;
-    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
-    T M(in+i);
-    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
-    T A = M*M.adjoint();
-    SelfAdjointEigenSolver<T> eig;
-    eig.computeDirect(M);
-    res = eig.eigenvalues();
-  }
-};
-
-void test_cuda_basic()
-{
-  ei_test_init_cuda();
-  
-  int nthreads = 100;
-  Eigen::VectorXf in, out;
-  
-  #ifndef __CUDA_ARCH__
-  int data_size = nthreads * 512;
-  in.setRandom(data_size);
-  out.setRandom(data_size);
-  #endif
-  
-  CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Vector3f>(), nthreads, in, out) );
-  CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Array44f>(), nthreads, in, out) );
-  
-  CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array4f>(), nthreads, in, out) );
-  CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array33f>(), nthreads, in, out) );
-  
-  CALL_SUBTEST( run_and_compare_to_cuda(redux<Array4f>(), nthreads, in, out) );
-  CALL_SUBTEST( run_and_compare_to_cuda(redux<Matrix3f>(), nthreads, in, out) );
-  
-  CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
-  CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
-  
-  CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
-  CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
-  
-  CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix3f>(), nthreads, in, out) );
-  CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix2f>(), nthreads, in, out) );
-
-}
diff --git a/contrib/eigen/test/cuda_common.h b/contrib/eigen/test/cuda_common.h
deleted file mode 100644
index 9737693acfa14dcd467ce7e27d365248ba3253c5..0000000000000000000000000000000000000000
--- a/contrib/eigen/test/cuda_common.h
+++ /dev/null
@@ -1,101 +0,0 @@
-
-#ifndef EIGEN_TEST_CUDA_COMMON_H
-#define EIGEN_TEST_CUDA_COMMON_H
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cuda_runtime_api.h>
-#include <iostream>
-
-#ifndef __CUDACC__
-dim3 threadIdx, blockDim, blockIdx;
-#endif
-
-template<typename Kernel, typename Input, typename Output>
-void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
-{
-  for(int i=0; i<n; i++)
-    ker(i, in.data(), out.data());
-}
-
-
-template<typename Kernel, typename Input, typename Output>
-__global__
-void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
-{
-  int i = threadIdx.x + blockIdx.x*blockDim.x;
-  if(i<n) {
-    ker(i, in, out);
-  }
-}
-
-
-template<typename Kernel, typename Input, typename Output>
-void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out)
-{
-  typename Input::Scalar*  d_in;
-  typename Output::Scalar* d_out;
-  std::ptrdiff_t in_bytes  = in.size()  * sizeof(typename Input::Scalar);
-  std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
-  
-  cudaMalloc((void**)(&d_in),  in_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
-  
-  cudaMemcpy(d_in,  in.data(),  in_bytes,  cudaMemcpyHostToDevice);
-  cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice);
-  
-  // Simple and non-optimal 1D mapping assuming n is not too large
-  // That's only for unit testing!
-  dim3 Blocks(128);
-  dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
-
-  cudaThreadSynchronize();
-  run_on_cuda_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
-  cudaThreadSynchronize();
-  
-  // check inputs have not been modified
-  cudaMemcpy(const_cast<typename Input::Scalar*>(in.data()),  d_in,  in_bytes,  cudaMemcpyDeviceToHost);
-  cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost);
-  
-  cudaFree(d_in);
-  cudaFree(d_out);
-}
-
-
-template<typename Kernel, typename Input, typename Output>
-void run_and_compare_to_cuda(const Kernel& ker, int n, const Input& in, Output& out)
-{
-  Input  in_ref,  in_cuda;
-  Output out_ref, out_cuda;
-  #ifndef __CUDA_ARCH__
-  in_ref = in_cuda = in;
-  out_ref = out_cuda = out;
-  #endif
-  run_on_cpu (ker, n, in_ref,  out_ref);
-  run_on_cuda(ker, n, in_cuda, out_cuda);
-  #ifndef __CUDA_ARCH__
-  VERIFY_IS_APPROX(in_ref, in_cuda);
-  VERIFY_IS_APPROX(out_ref, out_cuda);
-  #endif
-}
-
-
-void ei_test_init_cuda()
-{
-  int device = 0;
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, device);
-  std::cout << "CUDA device info:\n";
-  std::cout << "  name:                        " << deviceProp.name << "\n";
-  std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << "\n";
-  std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << "\n";
-  std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
-  std::cout << "  warpSize:                    " << deviceProp.warpSize << "\n";
-  std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << "\n";
-  std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << "\n";
-  std::cout << "  clockRate:                   " << deviceProp.clockRate << "\n";
-  std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << "\n";
-  std::cout << "  computeMode:                 " << deviceProp.computeMode << "\n";
-}
-
-#endif // EIGEN_TEST_CUDA_COMMON_H
diff --git a/contrib/eigen/test/denseLM.cpp b/contrib/eigen/test/denseLM.cpp
index 0aa736ea3fbe199eefb42c2d84e4fb48125776db..afb8004b1e35fe909d580133c1736ec4bf19179b 100644
--- a/contrib/eigen/test/denseLM.cpp
+++ b/contrib/eigen/test/denseLM.cpp
@@ -182,7 +182,7 @@ void test_denseLM_T()
   
 }
 
-void test_denseLM()
+EIGEN_DECLARE_TEST(denseLM)
 {
   CALL_SUBTEST_2(test_denseLM_T<double>());
   
diff --git a/contrib/eigen/test/dense_storage.cpp b/contrib/eigen/test/dense_storage.cpp
index e63712b1a4962da6b98dca4fd939bf47eedf6278..45c2bd728a993ee4747901536aea2876cbf18126 100644
--- a/contrib/eigen/test/dense_storage.cpp
+++ b/contrib/eigen/test/dense_storage.cpp
@@ -8,17 +8,27 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "main.h"
+#include "AnnoyingScalar.h"
+#include "SafeScalar.h"
 
 #include <Eigen/Core>
 
-template <typename T, int Rows, int Cols>
-void dense_storage_copy()
+#if EIGEN_HAS_TYPE_TRAITS && EIGEN_HAS_CXX11
+using DenseStorageD3x3 = Eigen::DenseStorage<double, 3, 3, 3, 3>;
+static_assert(std::is_trivially_move_constructible<DenseStorageD3x3>::value, "DenseStorage not trivially_move_constructible");
+static_assert(std::is_trivially_move_assignable<DenseStorageD3x3>::value, "DenseStorage not trivially_move_assignable");
+#if !defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN)
+static_assert(std::is_trivially_copy_constructible<DenseStorageD3x3>::value, "DenseStorage not trivially_copy_constructible");
+static_assert(std::is_trivially_copy_assignable<DenseStorageD3x3>::value, "DenseStorage not trivially_copy_assignable");
+static_assert(std::is_trivially_copyable<DenseStorageD3x3>::value, "DenseStorage not trivially_copyable");
+#endif
+#endif
+
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_copy(int rows, int cols)
 {
-  static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols);
-  typedef DenseStorage<T,Size, Rows,Cols, 0> DenseStorageType;
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
   
-  const int rows = (Rows==Dynamic) ? 4 : Rows;
-  const int cols = (Cols==Dynamic) ? 3 : Cols;
   const int size = rows*cols;
   DenseStorageType reference(size, rows, cols);
   T* raw_reference = reference.data();
@@ -31,14 +41,11 @@ void dense_storage_copy()
     VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
 }
 
-template <typename T, int Rows, int Cols>
-void dense_storage_assignment()
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_assignment(int rows, int cols)
 {
-  static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols);
-  typedef DenseStorage<T,Size, Rows,Cols, 0> DenseStorageType;
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
   
-  const int rows = (Rows==Dynamic) ? 4 : Rows;
-  const int cols = (Cols==Dynamic) ? 3 : Cols;
   const int size = rows*cols;
   DenseStorageType reference(size, rows, cols);
   T* raw_reference = reference.data();
@@ -52,25 +59,132 @@ void dense_storage_assignment()
     VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
 }
 
-void test_dense_storage()
+template <typename T, int Size, int Rows, int Cols>
+void dense_storage_swap(int rows0, int cols0, int rows1, int cols1)
 {
-  dense_storage_copy<int,Dynamic,Dynamic>();  
-  dense_storage_copy<int,Dynamic,3>();
-  dense_storage_copy<int,4,Dynamic>();
-  dense_storage_copy<int,4,3>();
+  typedef DenseStorage<T, Size, Rows, Cols, 0> DenseStorageType;
+  
+  const int size0 = rows0*cols0;
+  DenseStorageType a(size0, rows0, cols0);
+  for (int i=0; i<size0; ++i) {
+    a.data()[i] = static_cast<T>(i);
+  }
+  
+  const int size1 = rows1*cols1;
+  DenseStorageType b(size1, rows1, cols1);
+  for (int i=0; i<size1; ++i) {
+    b.data()[i] = static_cast<T>(-i);
+  }
+  
+  a.swap(b);
+  
+  for (int i=0; i<size0; ++i) {
+    VERIFY_IS_EQUAL(b.data()[i], static_cast<T>(i));
+  }
+  
+  for (int i=0; i<size1; ++i) {
+    VERIFY_IS_EQUAL(a.data()[i], static_cast<T>(-i));
+  }
+}
 
-  dense_storage_copy<float,Dynamic,Dynamic>();
-  dense_storage_copy<float,Dynamic,3>();
-  dense_storage_copy<float,4,Dynamic>();  
-  dense_storage_copy<float,4,3>();
+template<typename T, int Size, std::size_t Alignment>
+void dense_storage_alignment()
+{
+  #if EIGEN_HAS_ALIGNAS
   
-  dense_storage_assignment<int,Dynamic,Dynamic>();  
-  dense_storage_assignment<int,Dynamic,3>();
-  dense_storage_assignment<int,4,Dynamic>();
-  dense_storage_assignment<int,4,3>();
+  struct alignas(Alignment) Empty1 {};
+  VERIFY_IS_EQUAL(std::alignment_of<Empty1>::value, Alignment);
+
+  struct EIGEN_ALIGN_TO_BOUNDARY(Alignment) Empty2 {};
+  VERIFY_IS_EQUAL(std::alignment_of<Empty2>::value, Alignment);
+
+  struct Nested1 { EIGEN_ALIGN_TO_BOUNDARY(Alignment) T data[Size]; };
+  VERIFY_IS_EQUAL(std::alignment_of<Nested1>::value, Alignment);
+
+  VERIFY_IS_EQUAL( (std::alignment_of<internal::plain_array<T,Size,AutoAlign,Alignment> >::value), Alignment);
 
-  dense_storage_assignment<float,Dynamic,Dynamic>();
-  dense_storage_assignment<float,Dynamic,3>();
-  dense_storage_assignment<float,4,Dynamic>();  
-  dense_storage_assignment<float,4,3>();  
+  const std::size_t default_alignment = internal::compute_default_alignment<T,Size>::value;
+
+  VERIFY_IS_EQUAL( (std::alignment_of<DenseStorage<T,Size,1,1,AutoAlign> >::value), default_alignment);
+  VERIFY_IS_EQUAL( (std::alignment_of<Matrix<T,Size,1,AutoAlign> >::value), default_alignment);
+  struct Nested2 { Matrix<T,Size,1,AutoAlign> mat; };
+  VERIFY_IS_EQUAL(std::alignment_of<Nested2>::value, default_alignment);
+
+  #endif
+}
+
+template<typename T>
+void dense_storage_tests() {
+  // Dynamic Storage.
+  dense_storage_copy<T,Dynamic,Dynamic,Dynamic>(4, 3);  
+  dense_storage_copy<T,Dynamic,Dynamic,3>(4, 3);
+  dense_storage_copy<T,Dynamic,4,Dynamic>(4, 3);
+  // Fixed Storage.
+  dense_storage_copy<T,12,4,3>(4, 3);
+  dense_storage_copy<T,12,Dynamic,Dynamic>(4, 3);
+  dense_storage_copy<T,12,4,Dynamic>(4, 3);
+  dense_storage_copy<T,12,Dynamic,3>(4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_copy<T,18,Dynamic,Dynamic>(4, 3);
+  dense_storage_copy<T,18,4,Dynamic>(4, 3);
+  dense_storage_copy<T,18,Dynamic,3>(4, 3);
+  
+  // Dynamic Storage.
+  dense_storage_assignment<T,Dynamic,Dynamic,Dynamic>(4, 3);  
+  dense_storage_assignment<T,Dynamic,Dynamic,3>(4, 3);
+  dense_storage_assignment<T,Dynamic,4,Dynamic>(4, 3);
+  // Fixed Storage.
+  dense_storage_assignment<T,12,4,3>(4, 3);
+  dense_storage_assignment<T,12,Dynamic,Dynamic>(4, 3);
+  dense_storage_assignment<T,12,4,Dynamic>(4, 3);
+  dense_storage_assignment<T,12,Dynamic,3>(4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_assignment<T,18,Dynamic,Dynamic>(4, 3);
+  dense_storage_assignment<T,18,4,Dynamic>(4, 3);
+  dense_storage_assignment<T,18,Dynamic,3>(4, 3);
+  
+  // Dynamic Storage.
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(4, 3, 4, 3); 
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(4, 3, 2, 1);  
+  dense_storage_swap<T,Dynamic,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,Dynamic,Dynamic,3>(2, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,Dynamic,4,Dynamic>(4, 1, 4, 3);
+  // Fixed Storage.
+  dense_storage_swap<T,12,4,3>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(4, 3, 2, 1);
+  dense_storage_swap<T,12,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,12,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,12,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,12,4,Dynamic>(4, 1, 4, 3);
+  dense_storage_swap<T,12,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,12,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,12,Dynamic,3>(2, 3, 4, 3);
+  // Fixed Storage with Uninitialized Elements.
+  dense_storage_swap<T,18,Dynamic,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,18,Dynamic,Dynamic>(4, 3, 2, 1);
+  dense_storage_swap<T,18,Dynamic,Dynamic>(2, 1, 4, 3);
+  dense_storage_swap<T,18,4,Dynamic>(4, 3, 4, 3);
+  dense_storage_swap<T,18,4,Dynamic>(4, 3, 4, 1);
+  dense_storage_swap<T,18,4,Dynamic>(4, 1, 4, 3);
+  dense_storage_swap<T,18,Dynamic,3>(4, 3, 4, 3);
+  dense_storage_swap<T,18,Dynamic,3>(4, 3, 2, 3);
+  dense_storage_swap<T,18,Dynamic,3>(2, 3, 4, 3);
+  
+  dense_storage_alignment<T,16,8>();
+  dense_storage_alignment<T,16,16>();
+  dense_storage_alignment<T,16,32>();
+  dense_storage_alignment<T,16,64>();
+}
+
+EIGEN_DECLARE_TEST(dense_storage)
+{
+  dense_storage_tests<int>();
+  dense_storage_tests<float>();
+  dense_storage_tests<SafeScalar<float> >();
+  dense_storage_tests<AnnoyingScalar>();
 }
diff --git a/contrib/eigen/test/determinant.cpp b/contrib/eigen/test/determinant.cpp
index b8c9babb3a82f9b97a8a10b0a4dce5e60ac78dbd..7dd33c37389aa6b6ccdb8768a26696d3159e67b9 100644
--- a/contrib/eigen/test/determinant.cpp
+++ b/contrib/eigen/test/determinant.cpp
@@ -50,7 +50,7 @@ template<typename MatrixType> void determinant(const MatrixType& m)
   VERIFY_IS_APPROX(m2.block(0,0,0,0).determinant(), Scalar(1));
 }
 
-void test_determinant()
+EIGEN_DECLARE_TEST(determinant)
 {
   for(int i = 0; i < g_repeat; i++) {
     int s = 0;
diff --git a/contrib/eigen/test/diagonal.cpp b/contrib/eigen/test/diagonal.cpp
index 8ed9b4682a72012f96449cee07bd5d210f302247..4e8c4b3c97a05145202856626c2f6dd41c451f68 100644
--- a/contrib/eigen/test/diagonal.cpp
+++ b/contrib/eigen/test/diagonal.cpp
@@ -88,7 +88,7 @@ template<typename MatrixType> void diagonal_assert(const MatrixType& m) {
   VERIFY_RAISES_ASSERT( m1.diagonal(-(rows+1)) );
 }
 
-void test_diagonal()
+EIGEN_DECLARE_TEST(diagonal)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( diagonal(Matrix<float, 1, 1>()) );
diff --git a/contrib/eigen/test/diagonal_matrix_variadic_ctor.cpp b/contrib/eigen/test/diagonal_matrix_variadic_ctor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fbc8f8470c702e73957232625c6364e52e6a94af
--- /dev/null
+++ b/contrib/eigen/test/diagonal_matrix_variadic_ctor.cpp
@@ -0,0 +1,185 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 David Tellenbach <david.tellenbach@tellnotes.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+
+#include "main.h"
+
+template <typename Scalar>
+void assertionTest()
+{
+  typedef DiagonalMatrix<Scalar, 5> DiagMatrix5;
+  typedef DiagonalMatrix<Scalar, 7> DiagMatrix7;
+  typedef DiagonalMatrix<Scalar, Dynamic> DiagMatrixX;
+
+  Scalar raw[6];
+  for (int i = 0; i < 6; ++i) {
+    raw[i] = internal::random<Scalar>();
+  }
+
+  VERIFY_RAISES_ASSERT((DiagMatrix5{raw[0], raw[1], raw[2], raw[3]}));
+  VERIFY_RAISES_ASSERT((DiagMatrix5{raw[0], raw[1], raw[3]}));
+  VERIFY_RAISES_ASSERT((DiagMatrix7{raw[0], raw[1], raw[2], raw[3]}));
+
+  VERIFY_RAISES_ASSERT((DiagMatrixX {
+    {raw[0], raw[1], raw[2]},
+    {raw[3], raw[4], raw[5]}
+  }));
+}
+
+#define VERIFY_IMPLICIT_CONVERSION_3(DIAGTYPE, V0, V1, V2) \
+  DIAGTYPE d(V0, V1, V2);                                  \
+  DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix();     \
+  VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0);               \
+  VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1);               \
+  VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2);
+
+#define VERIFY_IMPLICIT_CONVERSION_4(DIAGTYPE, V0, V1, V2, V3) \
+  DIAGTYPE d(V0, V1, V2, V3);                                  \
+  DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix();         \
+  VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0);                   \
+  VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1);                   \
+  VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2);                   \
+  VERIFY_IS_APPROX(Dense(3, 3), (Scalar)V3);
+
+#define VERIFY_IMPLICIT_CONVERSION_5(DIAGTYPE, V0, V1, V2, V3, V4) \
+  DIAGTYPE d(V0, V1, V2, V3, V4);                                  \
+  DIAGTYPE::DenseMatrixType Dense = d.toDenseMatrix();             \
+  VERIFY_IS_APPROX(Dense(0, 0), (Scalar)V0);                       \
+  VERIFY_IS_APPROX(Dense(1, 1), (Scalar)V1);                       \
+  VERIFY_IS_APPROX(Dense(2, 2), (Scalar)V2);                       \
+  VERIFY_IS_APPROX(Dense(3, 3), (Scalar)V3);                       \
+  VERIFY_IS_APPROX(Dense(4, 4), (Scalar)V4);
+
+template<typename Scalar>
+void constructorTest()
+{
+  typedef DiagonalMatrix<Scalar, 0> DiagonalMatrix0;
+  typedef DiagonalMatrix<Scalar, 3> DiagonalMatrix3;
+  typedef DiagonalMatrix<Scalar, 4> DiagonalMatrix4;
+  typedef DiagonalMatrix<Scalar, Dynamic> DiagonalMatrixX;
+
+  Scalar raw[7];
+  for (int k = 0; k < 7; ++k) raw[k] = internal::random<Scalar>();
+
+  // Fixed-sized matrices
+  {
+    DiagonalMatrix0 a {{}};
+    VERIFY(a.rows() == 0);
+    VERIFY(a.cols() == 0);
+    typename DiagonalMatrix0::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrix3 a {{raw[0], raw[1], raw[2]}};
+    VERIFY(a.rows() == 3);
+    VERIFY(a.cols() == 3);
+    typename DiagonalMatrix3::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrix4 a {{raw[0], raw[1], raw[2], raw[3]}};
+    VERIFY(a.rows() == 4);
+    VERIFY(a.cols() == 4);
+    typename DiagonalMatrix4::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+
+  // dynamically sized matrices
+  {
+    DiagonalMatrixX a{{}};
+    VERIFY(a.rows() == 0);
+    VERIFY(a.rows() == 0);
+    typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrixX a{{raw[0], raw[1], raw[2], raw[3], raw[4], raw[5], raw[6]}};
+    VERIFY(a.rows() == 7);
+    VERIFY(a.rows() == 7);
+    typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+}
+
+template<>
+void constructorTest<float>()
+{
+  typedef float Scalar;
+
+  typedef DiagonalMatrix<Scalar, 0> DiagonalMatrix0;
+  typedef DiagonalMatrix<Scalar, 3> DiagonalMatrix3;
+  typedef DiagonalMatrix<Scalar, 4> DiagonalMatrix4;
+  typedef DiagonalMatrix<Scalar, 5> DiagonalMatrix5;
+  typedef DiagonalMatrix<Scalar, Dynamic> DiagonalMatrixX;
+
+  Scalar raw[7];
+  for (int k = 0; k < 7; ++k) raw[k] = internal::random<Scalar>();
+
+  // Fixed-sized matrices
+  {
+    DiagonalMatrix0 a {{}};
+    VERIFY(a.rows() == 0);
+    VERIFY(a.cols() == 0);
+    typename DiagonalMatrix0::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrix3 a {{raw[0], raw[1], raw[2]}};
+    VERIFY(a.rows() == 3);
+    VERIFY(a.cols() == 3);
+    typename DiagonalMatrix3::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrix4 a {{raw[0], raw[1], raw[2], raw[3]}};
+    VERIFY(a.rows() == 4);
+    VERIFY(a.cols() == 4);
+    typename DiagonalMatrix4::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+
+  // dynamically sized matrices
+  {
+    DiagonalMatrixX a{{}};
+    VERIFY(a.rows() == 0);
+    VERIFY(a.rows() == 0);
+    typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  {
+    DiagonalMatrixX a{{raw[0], raw[1], raw[2], raw[3], raw[4], raw[5], raw[6]}};
+    VERIFY(a.rows() == 7);
+    VERIFY(a.rows() == 7);
+    typename DiagonalMatrixX::DenseMatrixType m = a.toDenseMatrix();
+    for (Index k = 0; k < a.rows(); ++k) VERIFY(m(k, k) == raw[k]);
+  }
+  { VERIFY_IMPLICIT_CONVERSION_3(DiagonalMatrix3, 1.2647, 2.56f, -3); }
+  { VERIFY_IMPLICIT_CONVERSION_4(DiagonalMatrix4, 1.2647, 2.56f, -3, 3.23f); }
+  { VERIFY_IMPLICIT_CONVERSION_5(DiagonalMatrix5, 1.2647, 2.56f, -3, 3.23f, 2); }
+}
+
+EIGEN_DECLARE_TEST(diagonal_matrix_variadic_ctor)
+{
+  CALL_SUBTEST_1(assertionTest<unsigned char>());
+  CALL_SUBTEST_1(assertionTest<float>());
+  CALL_SUBTEST_1(assertionTest<Index>());
+  CALL_SUBTEST_1(assertionTest<int>());
+  CALL_SUBTEST_1(assertionTest<long int>());
+  CALL_SUBTEST_1(assertionTest<std::ptrdiff_t>());
+  CALL_SUBTEST_1(assertionTest<std::complex<double>>());
+
+  CALL_SUBTEST_2(constructorTest<unsigned char>());
+  CALL_SUBTEST_2(constructorTest<float>());
+  CALL_SUBTEST_2(constructorTest<Index>());
+  CALL_SUBTEST_2(constructorTest<int>());
+  CALL_SUBTEST_2(constructorTest<long int>());
+  CALL_SUBTEST_2(constructorTest<std::ptrdiff_t>());
+  CALL_SUBTEST_2(constructorTest<std::complex<double>>());
+}
diff --git a/contrib/eigen/test/diagonalmatrices.cpp b/contrib/eigen/test/diagonalmatrices.cpp
index c55733df8c8ba3a53fe9e7126c355a9bdaca1225..276beade05ee7fd8b93e36006cdeece3580cc55d 100644
--- a/contrib/eigen/test/diagonalmatrices.cpp
+++ b/contrib/eigen/test/diagonalmatrices.cpp
@@ -105,6 +105,13 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   sq_m2 = sq_m1 * sq_m2;
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) );
   VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
+
+  sq_m1 = v1.asDiagonal();
+  sq_m2 = v2.asDiagonal();
+  SquareMatrixType sq_m3 = v1.asDiagonal();
+  VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() + v2.asDiagonal(), sq_m1 + sq_m2);
+  VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() - v2.asDiagonal(), sq_m1 - sq_m2);
+  VERIFY_IS_APPROX( sq_m3 = v1.asDiagonal() - 2*v2.asDiagonal() + v1.asDiagonal(), sq_m1 - 2*sq_m2 + sq_m1);
 }
 
 template<typename MatrixType> void as_scalar_product(const MatrixType& m)
@@ -144,7 +151,7 @@ void bug987()
   VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() );
 }
 
-void test_diagonalmatrices()
+EIGEN_DECLARE_TEST(diagonalmatrices)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( diagonalmatrices(Matrix<float, 1, 1>()) );
diff --git a/contrib/eigen/test/dontalign.cpp b/contrib/eigen/test/dontalign.cpp
index ac00112edb252c59776695cdb88cd48849f0f848..2e4102b86f63ad8c0e96b2ab1c039e4c08b4fe23 100644
--- a/contrib/eigen/test/dontalign.cpp
+++ b/contrib/eigen/test/dontalign.cpp
@@ -44,7 +44,7 @@ void dontalign(const MatrixType& m)
   internal::aligned_delete(array, rows);
 }
 
-void test_dontalign()
+EIGEN_DECLARE_TEST(dontalign)
 {
 #if defined EIGEN_TEST_PART_1 || defined EIGEN_TEST_PART_5
   dontalign(Matrix3d());
diff --git a/contrib/eigen/test/dynalloc.cpp b/contrib/eigen/test/dynalloc.cpp
index f1cc70beeb154f6cd2dccbdfb7be0c6b00ce1434..23c90a7b5e901a2916b2ff7a418bcf51dad98a36 100644
--- a/contrib/eigen/test/dynalloc.cpp
+++ b/contrib/eigen/test/dynalloc.cpp
@@ -15,6 +15,7 @@
 #define ALIGNMENT 1
 #endif
 
+typedef Matrix<float,16,1> Vector16f;
 typedef Matrix<float,8,1> Vector8f;
 
 void check_handmade_aligned_malloc()
@@ -70,7 +71,7 @@ struct MyStruct
 {
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
   char dummychar;
-  Vector8f avec;
+  Vector16f avec;
 };
 
 class MyClassA
@@ -78,7 +79,7 @@ class MyClassA
   public:
     EIGEN_MAKE_ALIGNED_OPERATOR_NEW
     char dummychar;
-    Vector8f avec;
+    Vector16f avec;
 };
 
 template<typename T> void check_dynaligned()
@@ -106,7 +107,7 @@ template<typename T> void check_custom_new_delete()
     delete[] t;
   }
   
-#if EIGEN_MAX_ALIGN_BYTES>0
+#if EIGEN_MAX_ALIGN_BYTES>0 && (!EIGEN_HAS_CXX17_OVERALIGN)
   {
     T* t = static_cast<T *>((T::operator new)(sizeof(T)));
     (T::operator delete)(t, sizeof(T));
@@ -119,7 +120,7 @@ template<typename T> void check_custom_new_delete()
 #endif
 }
 
-void test_dynalloc()
+EIGEN_DECLARE_TEST(dynalloc)
 {
   // low level dynamic memory allocation
   CALL_SUBTEST(check_handmade_aligned_malloc());
@@ -145,6 +146,7 @@ void test_dynalloc()
     CALL_SUBTEST(check_dynaligned<Vector4d>() );
     CALL_SUBTEST(check_dynaligned<Vector4i>() );
     CALL_SUBTEST(check_dynaligned<Vector8f>() );
+    CALL_SUBTEST(check_dynaligned<Vector16f>() );
   }
 
   {
diff --git a/contrib/eigen/test/eigen2support.cpp b/contrib/eigen/test/eigen2support.cpp
index ac6931a0e6af5ed7e89557df41bdb4a8881ee2dd..49d7328e9e15db9509d2777288820b487bad33cd 100644
--- a/contrib/eigen/test/eigen2support.cpp
+++ b/contrib/eigen/test/eigen2support.cpp
@@ -52,7 +52,7 @@ template<typename MatrixType> void eigen2support(const MatrixType& m)
   m1.minor(0,0);
 }
 
-void test_eigen2support()
+EIGEN_DECLARE_TEST(eigen2support)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( eigen2support(Matrix<double,1,1>()) );
diff --git a/contrib/eigen/test/eigensolver_complex.cpp b/contrib/eigen/test/eigensolver_complex.cpp
index 7269452598ffe5c7faf3852022e73b9d5983d275..c5373f420e482f390f5efdfea516462840cffee9 100644
--- a/contrib/eigen/test/eigensolver_complex.cpp
+++ b/contrib/eigen/test/eigensolver_complex.cpp
@@ -47,7 +47,7 @@ template<typename MatrixType> bool find_pivot(typename MatrixType::Scalar tol, M
   return false;
 }
 
-/* Check that two column vectors are approximately equal upto permutations.
+/* Check that two column vectors are approximately equal up to permutations.
  * Initially, this method checked that the k-th power sums are equal for all k = 1, ..., vec1.rows(),
  * however this strategy is numerically inacurate because of numerical cancellation issues.
  */
@@ -152,7 +152,7 @@ template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m
   VERIFY_RAISES_ASSERT(eig.eigenvectors());
 }
 
-void test_eigensolver_complex()
+EIGEN_DECLARE_TEST(eigensolver_complex)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
diff --git a/contrib/eigen/test/eigensolver_generalized_real.cpp b/contrib/eigen/test/eigensolver_generalized_real.cpp
index 9dd44c89da434371bae0f31717bfa8cd08b85348..95ed431db70a040e63f1183a9ccfce8cb41ed76c 100644
--- a/contrib/eigen/test/eigensolver_generalized_real.cpp
+++ b/contrib/eigen/test/eigensolver_generalized_real.cpp
@@ -85,7 +85,7 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   }
 }
 
-void test_eigensolver_generalized_real()
+EIGEN_DECLARE_TEST(eigensolver_generalized_real)
 {
   for(int i = 0; i < g_repeat; i++) {
     int s = 0;
diff --git a/contrib/eigen/test/eigensolver_generic.cpp b/contrib/eigen/test/eigensolver_generic.cpp
index 5c13175690b5eb92ae0329fd4a366230fff6f0ba..7adb986658675525847cf40eb9b74cbb18d52f08 100644
--- a/contrib/eigen/test/eigensolver_generic.cpp
+++ b/contrib/eigen/test/eigensolver_generic.cpp
@@ -12,6 +12,21 @@
 #include <limits>
 #include <Eigen/Eigenvalues>
 
+template<typename EigType,typename MatType>
+void check_eigensolver_for_given_mat(const EigType &eig, const MatType& a)
+{
+  typedef typename NumTraits<typename MatType::Scalar>::Real RealScalar;
+  typedef Matrix<RealScalar, MatType::RowsAtCompileTime, 1> RealVectorType;
+  typedef typename std::complex<RealScalar> Complex;
+  Index n = a.rows();
+  VERIFY_IS_EQUAL(eig.info(), Success);
+  VERIFY_IS_APPROX(a * eig.pseudoEigenvectors(), eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix());
+  VERIFY_IS_APPROX(a.template cast<Complex>() * eig.eigenvectors(),
+                   eig.eigenvectors() * eig.eigenvalues().asDiagonal());
+  VERIFY_IS_APPROX(eig.eigenvectors().colwise().norm(), RealVectorType::Ones(n).transpose());
+  VERIFY_IS_APPROX(a.eigenvalues(), eig.eigenvalues());
+}
+
 template<typename MatrixType> void eigensolver(const MatrixType& m)
 {
   /* this test covers the following files:
@@ -22,8 +37,7 @@ template<typename MatrixType> void eigensolver(const MatrixType& m)
 
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, 1> RealVectorType;
-  typedef typename std::complex<typename NumTraits<typename MatrixType::Scalar>::Real> Complex;
+  typedef typename std::complex<RealScalar> Complex;
 
   MatrixType a = MatrixType::Random(rows,cols);
   MatrixType a1 = MatrixType::Random(rows,cols);
@@ -36,12 +50,7 @@ template<typename MatrixType> void eigensolver(const MatrixType& m)
     (ei0.pseudoEigenvectors().template cast<Complex>()) * (ei0.eigenvalues().asDiagonal()));
 
   EigenSolver<MatrixType> ei1(a);
-  VERIFY_IS_EQUAL(ei1.info(), Success);
-  VERIFY_IS_APPROX(a * ei1.pseudoEigenvectors(), ei1.pseudoEigenvectors() * ei1.pseudoEigenvalueMatrix());
-  VERIFY_IS_APPROX(a.template cast<Complex>() * ei1.eigenvectors(),
-                   ei1.eigenvectors() * ei1.eigenvalues().asDiagonal());
-  VERIFY_IS_APPROX(ei1.eigenvectors().colwise().norm(), RealVectorType::Ones(rows).transpose());
-  VERIFY_IS_APPROX(a.eigenvalues(), ei1.eigenvalues());
+  CALL_SUBTEST( check_eigensolver_for_given_mat(ei1,a) );
 
   EigenSolver<MatrixType> ei2;
   ei2.setMaxIterations(RealSchur<MatrixType>::m_maxIterationsPerRow * rows).compute(a);
@@ -100,7 +109,104 @@ template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m
   VERIFY_RAISES_ASSERT(eig.pseudoEigenvectors());
 }
 
-void test_eigensolver_generic()
+
+template<typename CoeffType>
+Matrix<typename CoeffType::Scalar,Dynamic,Dynamic>
+make_companion(const CoeffType& coeffs)
+{
+  Index n = coeffs.size()-1;
+  Matrix<typename CoeffType::Scalar,Dynamic,Dynamic> res(n,n);
+  res.setZero();
+	res.row(0) = -coeffs.tail(n) / coeffs(0);
+	res.diagonal(-1).setOnes();
+  return res;
+}
+
+template<int>
+void eigensolver_generic_extra()
+{
+  {
+    // regression test for bug 793
+    MatrixXd a(3,3);
+    a << 0,  0,  1,
+        1,  1, 1,
+        1, 1e+200,  1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    double scale = 1e-200; // scale to avoid overflow during the comparisons
+    VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
+    VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
+  }
+  {
+    // check a case where all eigenvalues are null.
+    MatrixXd a(2,2);
+    a << 1,  1,
+        -1, -1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
+    VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
+    VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
+  }
+
+  // regression test for bug 933
+  {
+    {
+      VectorXd coeffs(5); coeffs << 1, -3, -175, -225, 2250;
+      MatrixXd C = make_companion(coeffs);
+      EigenSolver<MatrixXd> eig(C);
+      CALL_SUBTEST( check_eigensolver_for_given_mat(eig,C) );
+    }
+    {
+      // this test is tricky because it requires high accuracy in smallest eigenvalues
+      VectorXd coeffs(5); coeffs << 6.154671e-15, -1.003870e-10, -9.819570e-01, 3.995715e+03, 2.211511e+08;
+      MatrixXd C = make_companion(coeffs);
+      EigenSolver<MatrixXd> eig(C);
+      CALL_SUBTEST( check_eigensolver_for_given_mat(eig,C) );
+      Index n = C.rows();
+      for(Index i=0;i<n;++i)
+      {
+        typedef std::complex<double> Complex;
+        MatrixXcd ac = C.cast<Complex>();
+        ac.diagonal().array() -= eig.eigenvalues()(i);
+        VectorXd sv = ac.jacobiSvd().singularValues();
+        // comparing to sv(0) is not enough here to catch the "bug",
+        // the hard-coded 1.0 is important!
+        VERIFY_IS_MUCH_SMALLER_THAN(sv(n-1), 1.0);
+      }
+    }
+  }
+  // regression test for bug 1557
+  {
+    // this test is interesting because it contains zeros on the diagonal.
+    MatrixXd A_bug1557(3,3);
+    A_bug1557 << 0, 0, 0, 1, 0, 0.5887907064808635127, 0, 1, 0;
+    EigenSolver<MatrixXd> eig(A_bug1557);
+    CALL_SUBTEST( check_eigensolver_for_given_mat(eig,A_bug1557) );
+  }
+
+  // regression test for bug 1174
+  {
+    Index n = 12;
+    MatrixXf A_bug1174(n,n);
+    A_bug1174 <<  262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432,
+                  262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432,
+                  262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432,
+                  262144, 0, 0, 262144, 786432, 0, 0, 0, 0, 0, 0, 786432,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0,
+                  0, 262144, 262144, 0, 0, 262144, 262144, 262144, 262144, 262144, 262144, 0;
+    EigenSolver<MatrixXf> eig(A_bug1174);
+    CALL_SUBTEST( check_eigensolver_for_given_mat(eig,A_bug1174) );
+  }
+}
+
+EIGEN_DECLARE_TEST(eigensolver_generic)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
@@ -135,31 +241,7 @@ void test_eigensolver_generic()
   }
   );
   
-#ifdef EIGEN_TEST_PART_2
-  {
-    // regression test for bug 793
-    MatrixXd a(3,3);
-    a << 0,  0,  1,
-        1,  1, 1,
-        1, 1e+200,  1;
-    Eigen::EigenSolver<MatrixXd> eig(a);
-    double scale = 1e-200; // scale to avoid overflow during the comparisons
-    VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
-    VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
-  }
-  {
-    // check a case where all eigenvalues are null.
-    MatrixXd a(2,2);
-    a << 1,  1,
-        -1, -1;
-    Eigen::EigenSolver<MatrixXd> eig(a);
-    VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
-    VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
-    VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
-    VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
-    VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
-  }
-#endif
+  CALL_SUBTEST_2( eigensolver_generic_extra<0>() );
   
   TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
diff --git a/contrib/eigen/test/eigensolver_selfadjoint.cpp b/contrib/eigen/test/eigensolver_selfadjoint.cpp
index 0e39b5364e663a56526020ac235d54a550126ce6..0fb2f4da766209ab394459bdc969e97bba1d6bb5 100644
--- a/contrib/eigen/test/eigensolver_selfadjoint.cpp
+++ b/contrib/eigen/test/eigensolver_selfadjoint.cpp
@@ -230,19 +230,25 @@ void bug_1204()
   SelfAdjointEigenSolver<Eigen::SparseMatrix<double> > eig(A);
 }
 
-void test_eigensolver_selfadjoint()
+EIGEN_DECLARE_TEST(eigensolver_selfadjoint)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
+
     // trivial test for 1x1 matrices:
     CALL_SUBTEST_1( selfadjointeigensolver(Matrix<float, 1, 1>()));
     CALL_SUBTEST_1( selfadjointeigensolver(Matrix<double, 1, 1>()));
+    CALL_SUBTEST_1( selfadjointeigensolver(Matrix<std::complex<double>, 1, 1>()));
+
     // very important to test 3x3 and 2x2 matrices since we provide special paths for them
     CALL_SUBTEST_12( selfadjointeigensolver(Matrix2f()) );
     CALL_SUBTEST_12( selfadjointeigensolver(Matrix2d()) );
+    CALL_SUBTEST_12( selfadjointeigensolver(Matrix2cd()) );
     CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) );
     CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) );
+    CALL_SUBTEST_13( selfadjointeigensolver(Matrix3cd()) );
     CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) );
+    CALL_SUBTEST_2( selfadjointeigensolver(Matrix4cd()) );
     
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) );
@@ -254,6 +260,8 @@ void test_eigensolver_selfadjoint()
     // some trivial but implementation-wise tricky cases
     CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(1,1)) );
     CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(2,2)) );
+    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(1,1)) );
+    CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(2,2)) );
     CALL_SUBTEST_6( selfadjointeigensolver(Matrix<double,1,1>()) );
     CALL_SUBTEST_7( selfadjointeigensolver(Matrix<double,2,2>()) );
   }
diff --git a/contrib/eigen/test/evaluators.cpp b/contrib/eigen/test/evaluators.cpp
index aed5a05a7fa8768a067aab7d7f8ebb6d4fef4e80..2810cd265187c27c669ee055f9a04528571a3664 100644
--- a/contrib/eigen/test/evaluators.cpp
+++ b/contrib/eigen/test/evaluators.cpp
@@ -90,6 +90,12 @@ namespace Eigen {
     {
       call_assignment_no_alias(dst.expression(), src, func);
     }
+
+    template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+    EIGEN_DEVICE_FUNC void call_restricted_packet_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+    {
+      call_restricted_packet_assignment_no_alias(dst.expression(), src, func);
+    }
   }
   
 }
@@ -101,7 +107,7 @@ using namespace std;
 #define VERIFY_IS_APPROX_EVALUATOR(DEST,EXPR) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (EXPR).eval());
 #define VERIFY_IS_APPROX_EVALUATOR2(DEST,EXPR,REF) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (REF).eval());
 
-void test_evaluators()
+EIGEN_DECLARE_TEST(evaluators)
 {
   // Testing Matrix evaluator and Transpose
   Vector2d v = Vector2d::Random();
@@ -496,4 +502,24 @@ void test_evaluators()
     VERIFY_IS_EQUAL( get_cost(a*(a+b)), 1);
     VERIFY_IS_EQUAL( get_cost(a.lazyProduct(a+b)), 15);
   }
+
+  // regression test for PR 544 and bug 1622 (introduced in #71609c4)
+  {
+    // test restricted_packet_assignment with an unaligned destination
+    const size_t M = 2;
+    const size_t K = 2;
+    const size_t N = 5;
+    float *destMem = new float[(M*N) + 1];
+    float *dest = (internal::UIntPtr(destMem)%EIGEN_MAX_ALIGN_BYTES) == 0 ? destMem+1 : destMem;
+
+    const Matrix<float, Dynamic, Dynamic, RowMajor> a = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(M, K);
+    const Matrix<float, Dynamic, Dynamic, RowMajor> b = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(K, N);
+    
+    Map<Matrix<float, Dynamic, Dynamic, RowMajor> > z(dest, M, N);;
+    Product<Matrix<float, Dynamic, Dynamic, RowMajor>, Matrix<float, Dynamic, Dynamic, RowMajor>, LazyProduct> tmp(a,b);
+    internal::call_restricted_packet_assignment(z.noalias(), tmp.derived(), internal::assign_op<float, float>());
+    
+    VERIFY_IS_APPROX(z, a*b);
+    delete[] destMem;
+  }
 }
diff --git a/contrib/eigen/test/exceptions.cpp b/contrib/eigen/test/exceptions.cpp
index 015b9fd33f31637d698aa2297302187653520d17..3d93060aba2c31e028624b087cede4ffa6054c24 100644
--- a/contrib/eigen/test/exceptions.cpp
+++ b/contrib/eigen/test/exceptions.cpp
@@ -8,93 +8,34 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-// Various sanity tests with exceptions:
+// Various sanity tests with exceptions and non trivially copyable scalar type.
 //  - no memory leak when a custom scalar type trow an exceptions
 //  - todo: complete the list of tests!
 
 #define EIGEN_STACK_ALLOCATION_LIMIT 100000000
 
 #include "main.h"
-
-struct my_exception
-{
-  my_exception() {}
-  ~my_exception() {}
-};
-    
-class ScalarWithExceptions
-{
-  public:
-    ScalarWithExceptions() { init(); }
-    ScalarWithExceptions(const float& _v) { init(); *v = _v; }
-    ScalarWithExceptions(const ScalarWithExceptions& other) { init(); *v = *(other.v); }
-    ~ScalarWithExceptions() {
-      delete v;
-      instances--;
-    }
-
-    void init() {
-      v = new float;
-      instances++;
-    }
-
-    ScalarWithExceptions operator+(const ScalarWithExceptions& other) const
-    {
-      countdown--;
-      if(countdown<=0)
-        throw my_exception();
-      return ScalarWithExceptions(*v+*other.v);
-    }
-    
-    ScalarWithExceptions operator-(const ScalarWithExceptions& other) const
-    { return ScalarWithExceptions(*v-*other.v); }
-    
-    ScalarWithExceptions operator*(const ScalarWithExceptions& other) const
-    { return ScalarWithExceptions((*v)*(*other.v)); }
-    
-    ScalarWithExceptions& operator+=(const ScalarWithExceptions& other)
-    { *v+=*other.v; return *this; }
-    ScalarWithExceptions& operator-=(const ScalarWithExceptions& other)
-    { *v-=*other.v; return *this; }
-    ScalarWithExceptions& operator=(const ScalarWithExceptions& other)
-    { *v = *(other.v); return *this; }
-  
-    bool operator==(const ScalarWithExceptions& other) const
-    { return *v==*other.v; }
-    bool operator!=(const ScalarWithExceptions& other) const
-    { return *v!=*other.v; }
-    
-    float* v;
-    static int instances;
-    static int countdown;
-};
-
-ScalarWithExceptions real(const ScalarWithExceptions &x) { return x; }
-ScalarWithExceptions imag(const ScalarWithExceptions & ) { return 0; }
-ScalarWithExceptions conj(const ScalarWithExceptions &x) { return x; }
-
-int ScalarWithExceptions::instances = 0;
-int ScalarWithExceptions::countdown = 0;
-
+#include "AnnoyingScalar.h"
 
 #define CHECK_MEMLEAK(OP) {                                 \
-    ScalarWithExceptions::countdown = 100;                  \
-    int before = ScalarWithExceptions::instances;           \
-    bool exception_thrown = false;                         \
-    try { OP; }                              \
+    AnnoyingScalar::countdown = 100;                        \
+    int before = AnnoyingScalar::instances;                 \
+    bool exception_thrown = false;                          \
+    try { OP; }                                             \
     catch (my_exception) {                                  \
       exception_thrown = true;                              \
-      VERIFY(ScalarWithExceptions::instances==before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
+      VERIFY(AnnoyingScalar::instances==before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
     } \
-    VERIFY(exception_thrown && " no exception thrown in " && EIGEN_MAKESTRING(OP)); \
+    VERIFY( (AnnoyingScalar::dont_throw) || (exception_thrown && " no exception thrown in " && EIGEN_MAKESTRING(OP)) ); \
   }
 
-void memoryleak()
+EIGEN_DECLARE_TEST(exceptions)
 {
-  typedef Eigen::Matrix<ScalarWithExceptions,Dynamic,1> VectorType;
-  typedef Eigen::Matrix<ScalarWithExceptions,Dynamic,Dynamic> MatrixType;
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,1> VectorType;
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,Dynamic> MatrixType;
   
   {
+    AnnoyingScalar::dont_throw = false;
     int n = 50;
     VectorType v0(n), v1(n);
     MatrixType m0(n,n), m1(n,n), m2(n,n);
@@ -104,12 +45,5 @@ void memoryleak()
     CHECK_MEMLEAK(m2 = m0 * m1 * m2);
     CHECK_MEMLEAK((v0+v1).dot(v0+v1));
   }
-  VERIFY(ScalarWithExceptions::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP)); \
-}
-
-void test_exceptions()
-{
-  EIGEN_TRY {
-    CALL_SUBTEST( memoryleak() );
-  } EIGEN_CATCH(...) {}
+  VERIFY(AnnoyingScalar::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP));
 }
diff --git a/contrib/eigen/test/fastmath.cpp b/contrib/eigen/test/fastmath.cpp
index e84bdc97232892eb1fbf7e20bc0a915ab3d93ab9..00a1a59b8b51b72b88611585706da5b1cb6b1221 100644
--- a/contrib/eigen/test/fastmath.cpp
+++ b/contrib/eigen/test/fastmath.cpp
@@ -88,7 +88,7 @@ void check_inf_nan(bool dryrun) {
   }
 }
 
-void test_fastmath() {
+EIGEN_DECLARE_TEST(fastmath) {
   std::cout << "*** float *** \n\n"; check_inf_nan<float>(true);
   std::cout << "*** double ***\n\n"; check_inf_nan<double>(true);
   std::cout << "*** long double *** \n\n"; check_inf_nan<long double>(true);
diff --git a/contrib/eigen/test/first_aligned.cpp b/contrib/eigen/test/first_aligned.cpp
index ae2d4bc4241efa1547352deb630d6e37ef021391..ed9945077b97129414b63e27eef65d403f23310a 100644
--- a/contrib/eigen/test/first_aligned.cpp
+++ b/contrib/eigen/test/first_aligned.cpp
@@ -26,7 +26,7 @@ void test_none_aligned_helper(Scalar *array, int size)
 
 struct some_non_vectorizable_type { float x; };
 
-void test_first_aligned()
+EIGEN_DECLARE_TEST(first_aligned)
 {
   EIGEN_ALIGN16 float array_float[100];
   test_first_aligned_helper(array_float, 50);
diff --git a/contrib/eigen/test/geo_alignedbox.cpp b/contrib/eigen/test/geo_alignedbox.cpp
index 4cf51aafb58a70f258816befb914fed847f16273..7b1684f292cdefc934c015b52f4c6c71ed03f6e9 100644
--- a/contrib/eigen/test/geo_alignedbox.cpp
+++ b/contrib/eigen/test/geo_alignedbox.cpp
@@ -9,27 +9,33 @@
 
 #include "main.h"
 #include <Eigen/Geometry>
-#include <Eigen/LU>
-#include <Eigen/QR>
 
-#include<iostream>
 using namespace std;
 
-// TODO not sure if this is actually still necessary anywhere ...
+// NOTE the following workaround was needed on some 32 bits builds to kill extra precision of x87 registers.
+// It seems that it is not needed anymore, but let's keep it here, just in case...
+
 template<typename T> EIGEN_DONT_INLINE
-void kill_extra_precision(T& ) {  }
+void kill_extra_precision(T& /* x */) {
+  // This one worked but triggered a warning:
+  /* eigen_assert((void*)(&x) != (void*)0); */
+  // An alternative could be:
+  /* volatile T tmp = x; */
+  /* x = tmp; */
+}
 
 
-template<typename BoxType> void alignedbox(const BoxType& _box)
+template<typename BoxType> void alignedbox(const BoxType& box)
 {
   /* this test covers the following files:
      AlignedBox.h
   */
   typedef typename BoxType::Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef NumTraits<Scalar> ScalarTraits;
+  typedef typename ScalarTraits::Real RealScalar;
   typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
 
-  const Index dim = _box.dim();
+  const Index dim = box.dim();
 
   VectorType p0 = VectorType::Random(dim);
   VectorType p1 = VectorType::Random(dim);
@@ -40,7 +46,7 @@ template<typename BoxType> void alignedbox(const BoxType& _box)
   BoxType b0(dim);
   BoxType b1(VectorType::Random(dim),VectorType::Random(dim));
   BoxType b2;
-  
+
   kill_extra_precision(b1);
   kill_extra_precision(p0);
   kill_extra_precision(p1);
@@ -62,7 +68,7 @@ template<typename BoxType> void alignedbox(const BoxType& _box)
   BoxType box2(VectorType::Random(dim));
   box2.extend(VectorType::Random(dim));
 
-  VERIFY(box1.intersects(box2) == !box1.intersection(box2).isEmpty()); 
+  VERIFY(box1.intersects(box2) == !box1.intersection(box2).isEmpty());
 
   // alignment -- make sure there is no memory alignment assertion
   BoxType *bp0 = new BoxType(dim);
@@ -80,16 +86,353 @@ template<typename BoxType> void alignedbox(const BoxType& _box)
 
 }
 
+template<typename BoxType> void alignedboxTranslatable(const BoxType& box)
+{
+  typedef typename BoxType::Scalar Scalar;
+  typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
+  typedef Transform<Scalar, BoxType::AmbientDimAtCompileTime, Isometry> IsometryTransform;
+  typedef Transform<Scalar, BoxType::AmbientDimAtCompileTime, Affine> AffineTransform;
+
+  alignedbox(box);
+
+  const VectorType Ones = VectorType::Ones();
+  const VectorType UnitX = VectorType::UnitX();
+  const Index dim = box.dim();
+
+  // box((-1, -1, -1), (1, 1, 1))
+  BoxType a(-Ones, Ones);
+
+  VERIFY_IS_APPROX(a.sizes(), Ones * Scalar(2));
+
+  BoxType b = a;
+  VectorType translate = Ones;
+  translate[0] = Scalar(2);
+  b.translate(translate);
+  // translate by (2, 1, 1) -> box((1, 0, 0), (3, 2, 2))
+
+  VERIFY_IS_APPROX(b.sizes(), Ones * Scalar(2));
+  VERIFY_IS_APPROX((b.min)(), UnitX);
+  VERIFY_IS_APPROX((b.max)(), Ones * Scalar(2) + UnitX);
+
+  // Test transform
+
+  IsometryTransform tf = IsometryTransform::Identity();
+  tf.translation() = -translate;
+
+  BoxType c = b.transformed(tf);
+  // translate by (-2, -1, -1) -> box((-1, -1, -1), (1, 1, 1))
+  VERIFY_IS_APPROX(c.sizes(), a.sizes());
+  VERIFY_IS_APPROX((c.min)(), (a.min)());
+  VERIFY_IS_APPROX((c.max)(), (a.max)());
+
+  c.transform(tf);
+  // translate by (-2, -1, -1) -> box((-3, -2, -2), (-1, 0, 0))
+  VERIFY_IS_APPROX(c.sizes(), a.sizes());
+  VERIFY_IS_APPROX((c.min)(), Ones * Scalar(-2) - UnitX);
+  VERIFY_IS_APPROX((c.max)(), -UnitX);
+
+  // Scaling
+
+  AffineTransform atf = AffineTransform::Identity();
+  atf.scale(Scalar(3));
+  c.transform(atf);
+  // scale by 3 -> box((-9, -6, -6), (-3, 0, 0))
+  VERIFY_IS_APPROX(c.sizes(), Scalar(3) * a.sizes());
+  VERIFY_IS_APPROX((c.min)(), Ones * Scalar(-6) - UnitX * Scalar(3));
+  VERIFY_IS_APPROX((c.max)(), UnitX * Scalar(-3));
+
+  atf = AffineTransform::Identity();
+  atf.scale(Scalar(-3));
+  c.transform(atf);
+  // scale by -3 -> box((27, 18, 18), (9, 0, 0))
+  VERIFY_IS_APPROX(c.sizes(), Scalar(9) * a.sizes());
+  VERIFY_IS_APPROX((c.min)(), UnitX * Scalar(9));
+  VERIFY_IS_APPROX((c.max)(), Ones * Scalar(18) + UnitX * Scalar(9));
+
+  // Check identity transform within numerical precision.
+  BoxType transformedC = c.transformed(IsometryTransform::Identity());
+  VERIFY_IS_APPROX(transformedC, c);
+
+  for (size_t i = 0; i < 10; ++i)
+  {
+    VectorType minCorner;
+    VectorType maxCorner;
+    for (Index d = 0; d < dim; ++d)
+    {
+      minCorner[d] = internal::random<Scalar>(-10,10);
+      maxCorner[d] = minCorner[d] + internal::random<Scalar>(0, 10);
+    }
+
+    c = BoxType(minCorner, maxCorner);
+
+    translate = VectorType::Random();
+    c.translate(translate);
+
+    VERIFY_IS_APPROX((c.min)(), minCorner + translate);
+    VERIFY_IS_APPROX((c.max)(), maxCorner + translate);
+  }
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate2D(Scalar angle) {
+  return Rotation2D<Scalar>(angle);
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate2DIntegral(typename NumTraits<Scalar>::NonInteger angle) {
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+  return Rotation2D<NonInteger>(angle).toRotationMatrix().
+      template cast<Scalar>();
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate3DZAxis(Scalar angle) {
+  return AngleAxis<Scalar>(angle, Matrix<Scalar, 3, 1>(0, 0, 1));
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate3DZAxisIntegral(typename NumTraits<Scalar>::NonInteger angle) {
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+  return AngleAxis<NonInteger>(angle, Matrix<NonInteger, 3, 1>(0, 0, 1)).
+      toRotationMatrix().template cast<Scalar>();
+}
+
+template<typename Scalar, typename Rotation>
+Rotation rotate4DZWAxis(Scalar angle) {
+  Rotation result = Matrix<Scalar, 4, 4>::Identity();
+  result.block(0, 0, 3, 3) = rotate3DZAxis<Scalar, AngleAxisd>(angle).toRotationMatrix();
+  return result;
+}
+
+template <typename MatrixType>
+MatrixType randomRotationMatrix()
+{
+  // algorithm from
+  // https://www.isprs-ann-photogramm-remote-sens-spatial-inf-sci.net/III-7/103/2016/isprs-annals-III-7-103-2016.pdf
+  const MatrixType rand = MatrixType::Random();
+  const MatrixType q = rand.householderQr().householderQ();
+  const JacobiSVD<MatrixType> svd = q.jacobiSvd(ComputeFullU | ComputeFullV);
+  const typename MatrixType::Scalar det = (svd.matrixU() * svd.matrixV().transpose()).determinant();
+  MatrixType diag = rand.Identity();
+  diag(MatrixType::RowsAtCompileTime - 1, MatrixType::ColsAtCompileTime - 1) = det;
+  const MatrixType rotation = svd.matrixU() * diag * svd.matrixV().transpose();
+  return rotation;
+}
+
+template <typename Scalar, int Dim>
+Matrix<Scalar, Dim, (1<<Dim)> boxGetCorners(const Matrix<Scalar, Dim, 1>& min_, const Matrix<Scalar, Dim, 1>& max_)
+{
+  Matrix<Scalar, Dim, (1<<Dim) > result;
+  for(Index i=0; i<(1<<Dim); ++i)
+  {
+    for(Index j=0; j<Dim; ++j)
+      result(j,i) = (i & (1<<j)) ? min_(j) : max_(j);
+  }
+  return result;
+}
+
+template<typename BoxType, typename Rotation> void alignedboxRotatable(
+    const BoxType& box,
+    Rotation (*rotate)(typename NumTraits<typename BoxType::Scalar>::NonInteger /*_angle*/))
+{
+  alignedboxTranslatable(box);
+
+  typedef typename BoxType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+  typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
+  typedef Transform<Scalar, BoxType::AmbientDimAtCompileTime, Isometry> IsometryTransform;
+  typedef Transform<Scalar, BoxType::AmbientDimAtCompileTime, Affine> AffineTransform;
+
+  const VectorType Zero = VectorType::Zero();
+  const VectorType Ones = VectorType::Ones();
+  const VectorType UnitX = VectorType::UnitX();
+  const VectorType UnitY = VectorType::UnitY();
+  // this is vector (0, 0, -1, -1, -1, ...), i.e. with zeros at first and second dimensions
+  const VectorType UnitZ = Ones - UnitX - UnitY;
+
+  // in this kind of comments the 3D case values will be illustrated
+  // box((-1, -1, -1), (1, 1, 1))
+  BoxType a(-Ones, Ones);
+
+  // to allow templating this test for both 2D and 3D cases, we always set all
+  // but the first coordinate to the same value; so basically 3D case works as
+  // if you were looking at the scene from top
+
+  VectorType minPoint = -2 * Ones;
+  minPoint[0] = -3;
+  VectorType maxPoint = Zero;
+  maxPoint[0] = -1;
+  BoxType c(minPoint, maxPoint);
+  // box((-3, -2, -2), (-1, 0, 0))
+
+  IsometryTransform tf2 = IsometryTransform::Identity();
+  // for some weird reason the following statement has to be put separate from
+  // the following rotate call, otherwise precision problems arise...
+  Rotation rot = rotate(NonInteger(EIGEN_PI));
+  tf2.rotate(rot);
+
+  c.transform(tf2);
+  // rotate by 180 deg around origin -> box((1, 0, -2), (3, 2, 0))
+
+  VERIFY_IS_APPROX(c.sizes(), a.sizes());
+  VERIFY_IS_APPROX((c.min)(), UnitX - UnitZ * Scalar(2));
+  VERIFY_IS_APPROX((c.max)(), UnitX * Scalar(3) + UnitY * Scalar(2));
+
+  rot = rotate(NonInteger(EIGEN_PI / 2));
+  tf2.setIdentity();
+  tf2.rotate(rot);
+
+  c.transform(tf2);
+  // rotate by 90 deg around origin ->  box((-2, 1, -2), (0, 3, 0))
+
+  VERIFY_IS_APPROX(c.sizes(), a.sizes());
+  VERIFY_IS_APPROX((c.min)(), Ones * Scalar(-2) + UnitY * Scalar(3));
+  VERIFY_IS_APPROX((c.max)(), UnitY * Scalar(3));
+
+  // box((-1, -1, -1), (1, 1, 1))
+  AffineTransform atf = AffineTransform::Identity();
+  atf.linearExt()(0, 1) = Scalar(1);
+  c = BoxType(-Ones, Ones);
+  c.transform(atf);
+  // 45 deg shear in x direction -> box((-2, -1, -1), (2, 1, 1))
+
+  VERIFY_IS_APPROX(c.sizes(), Ones * Scalar(2) + UnitX * Scalar(2));
+  VERIFY_IS_APPROX((c.min)(), -Ones - UnitX);
+  VERIFY_IS_APPROX((c.max)(), Ones + UnitX);
+}
+
+template<typename BoxType, typename Rotation> void alignedboxNonIntegralRotatable(
+    const BoxType& box,
+    Rotation (*rotate)(typename NumTraits<typename BoxType::Scalar>::NonInteger /*_angle*/))
+{
+  alignedboxRotatable(box, rotate);
+
+  typedef typename BoxType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+  enum { Dim = BoxType::AmbientDimAtCompileTime };
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+  typedef Matrix<Scalar, Dim, (1 << Dim)> CornersType;
+  typedef Transform<Scalar, Dim, Isometry> IsometryTransform;
+  typedef Transform<Scalar, Dim, Affine> AffineTransform;
+
+  const Index dim = box.dim();
+  const VectorType Zero = VectorType::Zero();
+  const VectorType Ones = VectorType::Ones();
+
+  VectorType minPoint = -2 * Ones;
+  minPoint[1] = 1;
+  VectorType maxPoint = Zero;
+  maxPoint[1] = 3;
+  BoxType c(minPoint, maxPoint);
+  // ((-2, 1, -2), (0, 3, 0))
+
+  VectorType cornerBL = (c.min)();
+  VectorType cornerTR = (c.max)();
+  VectorType cornerBR = (c.min)(); cornerBR[0] = cornerTR[0];
+  VectorType cornerTL = (c.max)(); cornerTL[0] = cornerBL[0];
+
+  NonInteger angle = NonInteger(EIGEN_PI/3);
+  Rotation rot = rotate(angle);
+  IsometryTransform tf2;
+  tf2.setIdentity();
+  tf2.rotate(rot);
+
+  c.transform(tf2);
+  // rotate by 60 deg ->  box((-3.59, -1.23, -2), (-0.86, 1.5, 0))
+
+  cornerBL = tf2 * cornerBL;
+  cornerBR = tf2 * cornerBR;
+  cornerTL = tf2 * cornerTL;
+  cornerTR = tf2 * cornerTR;
+
+  VectorType minCorner = Ones * Scalar(-2);
+  VectorType maxCorner = Zero;
+  minCorner[0] = (min)((min)(cornerBL[0], cornerBR[0]), (min)(cornerTL[0], cornerTR[0]));
+  maxCorner[0] = (max)((max)(cornerBL[0], cornerBR[0]), (max)(cornerTL[0], cornerTR[0]));
+  minCorner[1] = (min)((min)(cornerBL[1], cornerBR[1]), (min)(cornerTL[1], cornerTR[1]));
+  maxCorner[1] = (max)((max)(cornerBL[1], cornerBR[1]), (max)(cornerTL[1], cornerTR[1]));
+
+  for (Index d = 2; d < dim; ++d)
+    VERIFY_IS_APPROX(c.sizes()[d], Scalar(2));
+
+  VERIFY_IS_APPROX((c.min)(), minCorner);
+  VERIFY_IS_APPROX((c.max)(), maxCorner);
+
+  VectorType minCornerValue = Ones * Scalar(-2);
+  VectorType maxCornerValue = Zero;
+  minCornerValue[0] = Scalar(Scalar(-sqrt(2*2 + 3*3)) * Scalar(cos(Scalar(atan(2.0/3.0)) - angle/2)));
+  minCornerValue[1] = Scalar(Scalar(-sqrt(1*1 + 2*2)) * Scalar(sin(Scalar(atan(2.0/1.0)) - angle/2)));
+  maxCornerValue[0] = Scalar(-sin(angle));
+  maxCornerValue[1] = Scalar(3 * cos(angle));
+  VERIFY_IS_APPROX((c.min)(), minCornerValue);
+  VERIFY_IS_APPROX((c.max)(), maxCornerValue);
+
+  // randomized test - translate and rotate the box and compare to a box made of transformed vertices
+  for (size_t i = 0; i < 10; ++i)
+  {
+    for (Index d = 0; d < dim; ++d)
+    {
+      minCorner[d] = internal::random<Scalar>(-10,10);
+      maxCorner[d] = minCorner[d] + internal::random<Scalar>(0, 10);
+    }
+
+    c = BoxType(minCorner, maxCorner);
+
+    CornersType corners = boxGetCorners(minCorner, maxCorner);
+
+    typename AffineTransform::LinearMatrixType rotation =
+        randomRotationMatrix<typename AffineTransform::LinearMatrixType>();
 
+    tf2.setIdentity();
+    tf2.rotate(rotation);
+    tf2.translate(VectorType::Random());
+
+    c.transform(tf2);
+    corners = tf2 * corners;
+
+    minCorner = corners.rowwise().minCoeff();
+    maxCorner = corners.rowwise().maxCoeff();
+
+    VERIFY_IS_APPROX((c.min)(), minCorner);
+    VERIFY_IS_APPROX((c.max)(), maxCorner);
+  }
+
+  // randomized test - transform the box with a random affine matrix and compare to a box made of transformed vertices
+  for (size_t i = 0; i < 10; ++i)
+  {
+    for (Index d = 0; d < dim; ++d)
+    {
+      minCorner[d] = internal::random<Scalar>(-10,10);
+      maxCorner[d] = minCorner[d] + internal::random<Scalar>(0, 10);
+    }
+
+    c = BoxType(minCorner, maxCorner);
+
+    CornersType corners = boxGetCorners(minCorner, maxCorner);
+
+    AffineTransform atf = AffineTransform::Identity();
+    atf.linearExt() = AffineTransform::LinearPart::Random();
+    atf.translate(VectorType::Random());
+
+    c.transform(atf);
+    corners = atf * corners;
+
+    minCorner = corners.rowwise().minCoeff();
+    maxCorner = corners.rowwise().maxCoeff();
+
+    VERIFY_IS_APPROX((c.min)(), minCorner);
+    VERIFY_IS_APPROX((c.max)(), maxCorner);
+  }
+}
 
 template<typename BoxType>
-void alignedboxCastTests(const BoxType& _box)
+void alignedboxCastTests(const BoxType& box)
 {
-  // casting  
+  // casting
   typedef typename BoxType::Scalar Scalar;
   typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
 
-  const Index dim = _box.dim();
+  const Index dim = box.dim();
 
   VectorType p0 = VectorType::Random(dim);
   VectorType p1 = VectorType::Random(dim);
@@ -161,25 +504,25 @@ void specificTest2()
 }
 
 
-void test_geo_alignedbox()
+EIGEN_DECLARE_TEST(geo_alignedbox)
 {
   for(int i = 0; i < g_repeat; i++)
   {
-    CALL_SUBTEST_1( alignedbox(AlignedBox2f()) );
+    CALL_SUBTEST_1( (alignedboxNonIntegralRotatable<AlignedBox2f, Rotation2Df>(AlignedBox2f(), &rotate2D)) );
     CALL_SUBTEST_2( alignedboxCastTests(AlignedBox2f()) );
 
-    CALL_SUBTEST_3( alignedbox(AlignedBox3f()) );
+    CALL_SUBTEST_3( (alignedboxNonIntegralRotatable<AlignedBox3f, AngleAxisf>(AlignedBox3f(), &rotate3DZAxis)) );
     CALL_SUBTEST_4( alignedboxCastTests(AlignedBox3f()) );
 
-    CALL_SUBTEST_5( alignedbox(AlignedBox4d()) );
+    CALL_SUBTEST_5( (alignedboxNonIntegralRotatable<AlignedBox4d, Matrix4d>(AlignedBox4d(), &rotate4DZWAxis)) );
     CALL_SUBTEST_6( alignedboxCastTests(AlignedBox4d()) );
 
-    CALL_SUBTEST_7( alignedbox(AlignedBox1d()) );
+    CALL_SUBTEST_7( alignedboxTranslatable(AlignedBox1d()) );
     CALL_SUBTEST_8( alignedboxCastTests(AlignedBox1d()) );
 
-    CALL_SUBTEST_9( alignedbox(AlignedBox1i()) );
-    CALL_SUBTEST_10( alignedbox(AlignedBox2i()) );
-    CALL_SUBTEST_11( alignedbox(AlignedBox3i()) );
+    CALL_SUBTEST_9( alignedboxTranslatable(AlignedBox1i()) );
+    CALL_SUBTEST_10( (alignedboxRotatable<AlignedBox2i, Matrix2i>(AlignedBox2i(), &rotate2DIntegral<int, Matrix2i>)) );
+    CALL_SUBTEST_11( (alignedboxRotatable<AlignedBox3i, Matrix3i>(AlignedBox3i(), &rotate3DZAxisIntegral<int, Matrix3i>)) );
 
     CALL_SUBTEST_14( alignedbox(AlignedBox<double,Dynamic>(4)) );
   }
diff --git a/contrib/eigen/test/geo_eulerangles.cpp b/contrib/eigen/test/geo_eulerangles.cpp
index 932ebe7732f62b9f611f608b83c21c973a485e60..693c627a9a451cdebf9c447d4729ed10ec64689a 100644
--- a/contrib/eigen/test/geo_eulerangles.cpp
+++ b/contrib/eigen/test/geo_eulerangles.cpp
@@ -103,7 +103,7 @@ template<typename Scalar> void eulerangles()
   check_all_var(ea);
 }
 
-void test_geo_eulerangles()
+EIGEN_DECLARE_TEST(geo_eulerangles)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( eulerangles<float>() );
diff --git a/contrib/eigen/test/geo_homogeneous.cpp b/contrib/eigen/test/geo_homogeneous.cpp
index 2187c7bf988036e3b393fadc36a299ae3ad3ca17..9aebe6226ff23907cb202c590cdc2b247ac09506 100644
--- a/contrib/eigen/test/geo_homogeneous.cpp
+++ b/contrib/eigen/test/geo_homogeneous.cpp
@@ -115,7 +115,7 @@ template<typename Scalar,int Size> void homogeneous(void)
   VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
 }
 
-void test_geo_homogeneous()
+EIGEN_DECLARE_TEST(geo_homogeneous)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(( homogeneous<float,1>() ));
diff --git a/contrib/eigen/test/geo_hyperplane.cpp b/contrib/eigen/test/geo_hyperplane.cpp
index b3a48c58591c78f2bd2d94afa64fe9f3f450673a..44b2f2aecb3e8e9b239f93cc586778f0b11b779d 100644
--- a/contrib/eigen/test/geo_hyperplane.cpp
+++ b/contrib/eigen/test/geo_hyperplane.cpp
@@ -117,7 +117,7 @@ template<typename Scalar> void lines()
       VERIFY_IS_APPROX(result, center);
 
     // check conversions between two types of lines
-    PLine pl(line_u); // gcc 3.3 will commit suicide if we don't name this variable
+    PLine pl(line_u); // gcc 3.3 will crash if we don't name this variable.
     HLine line_u2(pl);
     CoeffsType converted_coeffs = line_u2.coeffs();
     if(line_u2.normal().dot(line_u.normal())<Scalar(0))
@@ -172,15 +172,10 @@ template<typename Scalar> void hyperplane_alignment()
 
   VERIFY_IS_APPROX(p1->coeffs(), p2->coeffs());
   VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs());
-  
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES > 0
-  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Plane3a));
-  #endif
 }
 
 
-void test_geo_hyperplane()
+EIGEN_DECLARE_TEST(geo_hyperplane)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( hyperplane(Hyperplane<float,2>()) );
diff --git a/contrib/eigen/test/geo_orthomethods.cpp b/contrib/eigen/test/geo_orthomethods.cpp
index e178df2575148fdfbfca4c773689e9ac9b32d57d..b7b660740284d3bb94b14693a4a1134bfb8f6eac 100644
--- a/contrib/eigen/test/geo_orthomethods.cpp
+++ b/contrib/eigen/test/geo_orthomethods.cpp
@@ -115,7 +115,7 @@ template<typename Scalar, int Size> void orthomethods(int size=Size)
   VERIFY_IS_APPROX(mcrossN3.row(i), matN3.row(i).cross(vec3));
 }
 
-void test_geo_orthomethods()
+EIGEN_DECLARE_TEST(geo_orthomethods)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( orthomethods_3<float>() );
diff --git a/contrib/eigen/test/geo_parametrizedline.cpp b/contrib/eigen/test/geo_parametrizedline.cpp
index 6a8794726088aadc7df4b307778782ba8e3eff32..e4b194abc407a7b45bbc437688462051b1132864 100644
--- a/contrib/eigen/test/geo_parametrizedline.cpp
+++ b/contrib/eigen/test/geo_parametrizedline.cpp
@@ -24,6 +24,8 @@ template<typename LineType> void parametrizedline(const LineType& _line)
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef Matrix<Scalar, LineType::AmbientDimAtCompileTime, 1> VectorType;
   typedef Hyperplane<Scalar,LineType::AmbientDimAtCompileTime> HyperplaneType;
+  typedef Matrix<Scalar, HyperplaneType::AmbientDimAtCompileTime,
+                         HyperplaneType::AmbientDimAtCompileTime> MatrixType;
 
   VectorType p0 = VectorType::Random(dim);
   VectorType p1 = VectorType::Random(dim);
@@ -58,6 +60,31 @@ template<typename LineType> void parametrizedline(const LineType& _line)
   VERIFY_IS_MUCH_SMALLER_THAN(hp.signedDistance(pi), RealScalar(1));
   VERIFY_IS_MUCH_SMALLER_THAN(l0.distance(pi), RealScalar(1));
   VERIFY_IS_APPROX(l0.intersectionPoint(hp), pi);
+
+  // transform
+  if (!NumTraits<Scalar>::IsComplex)
+  {
+    MatrixType rot = MatrixType::Random(dim,dim).householderQr().householderQ();
+    DiagonalMatrix<Scalar,LineType::AmbientDimAtCompileTime> scaling(VectorType::Random());
+    Translation<Scalar,LineType::AmbientDimAtCompileTime> translation(VectorType::Random());
+
+    while(scaling.diagonal().cwiseAbs().minCoeff()<RealScalar(1e-4)) scaling.diagonal() = VectorType::Random();
+
+    LineType l1 = l0;
+    VectorType p3 = l0.pointAt(Scalar(1));
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot).distance(rot * p3), Scalar(1) );
+    l1 = l0;
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot,Isometry).distance(rot * p3), Scalar(1) );
+    l1 = l0;
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot*scaling).distance((rot*scaling) * p3), Scalar(1) );
+    l1 = l0;
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot*scaling*translation)
+                                   .distance((rot*scaling*translation) * p3), Scalar(1) );
+    l1 = l0;
+    VERIFY_IS_MUCH_SMALLER_THAN( l1.transform(rot*translation,Isometry)
+                                   .distance((rot*translation) * p3), Scalar(1) );
+  }
+
 }
 
 template<typename Scalar> void parametrizedline_alignment()
@@ -83,14 +110,9 @@ template<typename Scalar> void parametrizedline_alignment()
   VERIFY_IS_APPROX(p1->origin(), p3->origin());
   VERIFY_IS_APPROX(p1->direction(), p2->direction());
   VERIFY_IS_APPROX(p1->direction(), p3->direction());
-  
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Line4a));
-  #endif
 }
 
-void test_geo_parametrizedline()
+EIGEN_DECLARE_TEST(geo_parametrizedline)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( parametrizedline(ParametrizedLine<float,2>()) );
diff --git a/contrib/eigen/test/geo_quaternion.cpp b/contrib/eigen/test/geo_quaternion.cpp
index 87680f1ccac21149e52e50837dc0c92ddca125f8..c561fc89d0e8f314354468824cce8a3bfd8d3511 100644
--- a/contrib/eigen/test/geo_quaternion.cpp
+++ b/contrib/eigen/test/geo_quaternion.cpp
@@ -12,6 +12,7 @@
 #include <Eigen/Geometry>
 #include <Eigen/LU>
 #include <Eigen/SVD>
+#include "AnnoyingScalar.h"
 
 template<typename T> T bounded_acos(T v)
 {
@@ -74,6 +75,13 @@ template<typename Scalar, int Options> void quaternion(void)
   q1.coeffs().setRandom();
   VERIFY_IS_APPROX(q1.coeffs(), (q1*q2).coeffs());
 
+#ifndef EIGEN_NO_IO
+  // Printing
+  std::ostringstream ss;
+  ss << q2;
+  VERIFY(ss.str() == "0i + 0j + 0k + 1");
+#endif
+
   // concatenation
   q1 *= q2;
 
@@ -85,7 +93,7 @@ template<typename Scalar, int Options> void quaternion(void)
   if (refangle>Scalar(EIGEN_PI))
     refangle = Scalar(2)*Scalar(EIGEN_PI) - refangle;
 
-  if((q1.coeffs()-q2.coeffs()).norm() > 10*largeEps)
+  if((q1.coeffs()-q2.coeffs()).norm() > Scalar(10)*largeEps)
   {
     VERIFY_IS_MUCH_SMALLER_THAN(abs(q1.angularDistance(q2) - refangle), Scalar(1));
   }
@@ -113,7 +121,7 @@ template<typename Scalar, int Options> void quaternion(void)
 
   // Do not execute the test if the rotation angle is almost zero, or
   // the rotation axis and v1 are almost parallel.
-  if (abs(aa.angle()) > 5*test_precision<Scalar>()
+  if (abs(aa.angle()) > Scalar(5)*test_precision<Scalar>()
       && (aa.axis() - v1.normalized()).norm() < Scalar(1.99)
       && (aa.axis() + v1.normalized()).norm() < Scalar(1.99))
   {
@@ -210,10 +218,6 @@ template<typename Scalar> void mapQuaternion(void){
   VERIFY_IS_APPROX(q1.coeffs(), q2.coeffs());
   VERIFY_IS_APPROX(q1.coeffs(), q3.coeffs());
   VERIFY_IS_APPROX(q4.coeffs(), q3.coeffs());
-  #ifdef EIGEN_VECTORIZE
-  if(internal::packet_traits<Scalar>::Vectorizable)
-    VERIFY_RAISES_ASSERT((MQuaternionA(array3unaligned)));
-  #endif
     
   VERIFY_IS_APPROX(mq1 * (mq1.inverse() * v1), v1);
   VERIFY_IS_APPROX(mq1 * (mq1.conjugate() * v1), v1);
@@ -241,7 +245,7 @@ template<typename Scalar> void mapQuaternion(void){
   const MQuaternionUA& cmq3(mq3);
   VERIFY( &cmq3.x() == &mq3.x() );
   // FIXME the following should be ok. The problem is that currently the LValueBit flag
-  // is used to determine wether we can return a coeff by reference or not, which is not enough for Map<const ...>.
+  // is used to determine whether we can return a coeff by reference or not, which is not enough for Map<const ...>.
   //const MCQuaternionUA& cmcq3(mcq3);
   //VERIFY( &cmcq3.x() == &mcq3.x() );
 
@@ -273,10 +277,6 @@ template<typename Scalar> void quaternionAlignment(void){
 
   VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs());
   VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs());
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(arrayunaligned)) QuaternionA));
-  #endif
 }
 
 template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
@@ -293,18 +293,40 @@ template<typename PlainObjectType> void check_const_correctness(const PlainObjec
   VERIFY( !(Map<ConstPlainObjectType, Aligned>::Flags & LvalueBit) );
 }
 
-void test_geo_quaternion()
+#if EIGEN_HAS_RVALUE_REFERENCES
+
+// Regression for bug 1573
+struct MovableClass {
+  // The following line is a workaround for gcc 4.7 and 4.8 (see bug 1573 comments).
+  static_assert(std::is_nothrow_move_constructible<Quaternionf>::value,"");
+  MovableClass() = default;
+  MovableClass(const MovableClass&) = default;
+  MovableClass(MovableClass&&) noexcept = default;
+  MovableClass& operator=(const MovableClass&) = default;
+  MovableClass& operator=(MovableClass&&) = default;
+  Quaternionf m_quat;
+};
+
+#endif
+
+EIGEN_DECLARE_TEST(geo_quaternion)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(( quaternion<float,AutoAlign>() ));
     CALL_SUBTEST_1( check_const_correctness(Quaternionf()) );
+    CALL_SUBTEST_1(( quaternion<float,DontAlign>() ));
+    CALL_SUBTEST_1(( quaternionAlignment<float>() ));
+    CALL_SUBTEST_1( mapQuaternion<float>() );
+
     CALL_SUBTEST_2(( quaternion<double,AutoAlign>() ));
     CALL_SUBTEST_2( check_const_correctness(Quaterniond()) );
-    CALL_SUBTEST_3(( quaternion<float,DontAlign>() ));
-    CALL_SUBTEST_4(( quaternion<double,DontAlign>() ));
-    CALL_SUBTEST_5(( quaternionAlignment<float>() ));
-    CALL_SUBTEST_6(( quaternionAlignment<double>() ));
-    CALL_SUBTEST_1( mapQuaternion<float>() );
+    CALL_SUBTEST_2(( quaternion<double,DontAlign>() ));
+    CALL_SUBTEST_2(( quaternionAlignment<double>() ));
     CALL_SUBTEST_2( mapQuaternion<double>() );
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    AnnoyingScalar::dont_throw = true;
+#endif
+    CALL_SUBTEST_3(( quaternion<AnnoyingScalar,AutoAlign>() ));
   }
 }
diff --git a/contrib/eigen/test/geo_transformations.cpp b/contrib/eigen/test/geo_transformations.cpp
old mode 100755
new mode 100644
index 8d064ddc3bc889e1a05fda60fa15b57df3cd7b43..72c6edac14708ffb3969855dd7096ca94504773f
--- a/contrib/eigen/test/geo_transformations.cpp
+++ b/contrib/eigen/test/geo_transformations.cpp
@@ -582,11 +582,6 @@ template<typename Scalar> void transform_alignment()
   VERIFY_IS_APPROX(p1->matrix(), p3->matrix());
   
   VERIFY_IS_APPROX( (*p1) * (*p1), (*p2)*(*p3));
-  
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Projective3a));
-  #endif
 }
 
 template<typename Scalar, int Dim, int Options> void transform_products()
@@ -666,13 +661,45 @@ template<typename Scalar, int Mode, int Options> void transformations_no_scale()
   VERIFY((m3 * m3.inverse()).isIdentity(test_precision<Scalar>()));
   // Verify implicit last row is initialized.
   VERIFY_IS_APPROX(Vector4(m3.row(3)), Vector4(0.0, 0.0, 0.0, 1.0));
+
+  VERIFY_IS_APPROX(t3.rotation(), t3.linear());
+  if(Mode==Isometry)
+    VERIFY(t3.rotation().data()==t3.linear().data());
+}
+
+template<typename Scalar, int Mode, int Options> void transformations_computed_scaling_continuity()
+{
+  typedef Matrix<Scalar, 3, 1> Vector3;
+  typedef Transform<Scalar, 3, Mode, Options> Transform3;
+  typedef Matrix<Scalar, 3, 3> Matrix3;
+
+  // Given: two transforms that differ by '2*eps'.
+  Scalar eps(1e-3);
+  Vector3 v0 = Vector3::Random().normalized(),
+    v1 = Vector3::Random().normalized(),
+    v3 = Vector3::Random().normalized();
+  Transform3 t0, t1;
+  // The interesting case is when their determinants have different signs.
+  Matrix3 rank2 = 50 * v0 * v0.adjoint() + 20 * v1 * v1.adjoint();
+  t0.linear() = rank2 + eps * v3 * v3.adjoint();
+  t1.linear() = rank2 - eps * v3 * v3.adjoint();
+
+  // When: computing the rotation-scaling parts
+  Matrix3 r0, s0, r1, s1;
+  t0.computeRotationScaling(&r0, &s0);
+  t1.computeRotationScaling(&r1, &s1);
+
+  // Then: the scaling parts should differ by no more than '2*eps'.
+  const Scalar c(2.1); // 2 + room for rounding errors
+  VERIFY((s0 - s1).norm() < c * eps);
 }
 
-void test_geo_transformations()
+EIGEN_DECLARE_TEST(geo_transformations)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(( transformations<double,Affine,AutoAlign>() ));
     CALL_SUBTEST_1(( non_projective_only<double,Affine,AutoAlign>() ));
+    CALL_SUBTEST_1(( transformations_computed_scaling_continuity<double,Affine,AutoAlign>() ));   
     
     CALL_SUBTEST_2(( transformations<float,AffineCompact,AutoAlign>() ));
     CALL_SUBTEST_2(( non_projective_only<float,AffineCompact,AutoAlign>() ));
diff --git a/contrib/eigen/test/gpu_basic.cu b/contrib/eigen/test/gpu_basic.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4298da3bb6e12138cbd3e2e3ccb77e42e2df7c39
--- /dev/null
+++ b/contrib/eigen/test/gpu_basic.cu
@@ -0,0 +1,461 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// workaround issue between gcc >= 4.7 and cuda 5.5
+#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
+  #undef _GLIBCXX_ATOMIC_BUILTINS
+  #undef _GLIBCXX_USE_INT128
+#endif
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include "main.h"
+#include "gpu_common.h"
+
+// Check that dense modules can be properly parsed by nvcc
+#include <Eigen/Dense>
+
+// struct Foo{
+//   EIGEN_DEVICE_FUNC
+//   void operator()(int i, const float* mats, float* vecs) const {
+//     using namespace Eigen;
+//   //   Matrix3f M(data);
+//   //   Vector3f x(data+9);
+//   //   Map<Vector3f>(data+9) = M.inverse() * x;
+//     Matrix3f M(mats+i/16);
+//     Vector3f x(vecs+i*3);
+//   //   using std::min;
+//   //   using std::sqrt;
+//     Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() *  x) / x.x();
+//     //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
+//   }
+// };
+
+template<typename T>
+struct coeff_wise {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    T x1(in+i);
+    T x2(in+i+1);
+    T x3(in+i+2);
+    Map<T> res(out+i*T::MaxSizeAtCompileTime);
+    
+    res.array() += (in[0] * x1 + x2).array() * x3.array();
+  }
+};
+
+template<typename T>
+struct complex_sqrt {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_special_inputs = 18;
+    
+    if (i == 0) {
+      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
+      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
+      SpecialInputs special_in;
+      special_in.setZero();
+      int idx = 0;
+      special_in[idx++] = ComplexType(0, 0);
+      special_in[idx++] = ComplexType(-0, 0);
+      special_in[idx++] = ComplexType(0, -0);
+      special_in[idx++] = ComplexType(-0, -0);
+      // GCC's fallback sqrt implementation fails for inf inputs.
+      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
+      // clang includes the GCC header (which temporarily disables
+      // _GLIBCXX_USE_C99_COMPLEX)
+      #if !defined(_GLIBCXX_COMPLEX) || \
+        (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
+      const ValueType inf = std::numeric_limits<ValueType>::infinity();
+      special_in[idx++] = ComplexType(1.0, inf);
+      special_in[idx++] = ComplexType(nan, inf);
+      special_in[idx++] = ComplexType(1.0, -inf);
+      special_in[idx++] = ComplexType(nan, -inf);
+      special_in[idx++] = ComplexType(-inf, 1.0);
+      special_in[idx++] = ComplexType(inf, 1.0);
+      special_in[idx++] = ComplexType(-inf, -1.0);
+      special_in[idx++] = ComplexType(inf, -1.0);
+      special_in[idx++] = ComplexType(-inf, nan);
+      special_in[idx++] = ComplexType(inf, nan);
+      #endif
+      special_in[idx++] = ComplexType(1.0, nan);
+      special_in[idx++] = ComplexType(nan, 1.0);
+      special_in[idx++] = ComplexType(nan, -1.0);
+      special_in[idx++] = ComplexType(nan, nan);
+      
+      Map<SpecialInputs> special_out(out);
+      special_out = special_in.cwiseSqrt();
+    }
+    
+    T x1(in + i);
+    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
+    res = x1.cwiseSqrt();
+  }
+};
+
+template<typename T>
+struct complex_operators {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_scalar_operators = 24;
+    const int num_vector_operators = 23;  // no unary + operator.
+    int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime);
+    
+    // Scalar operators.
+    const ComplexType a = in[i];
+    const ComplexType b = in[i + 1];
+    
+    out[out_idx++] = +a;
+    out[out_idx++] = -a;
+    
+    out[out_idx++] = a + b;
+    out[out_idx++] = a + numext::real(b);
+    out[out_idx++] = numext::real(a) + b;
+    out[out_idx++] = a - b;
+    out[out_idx++] = a - numext::real(b);
+    out[out_idx++] = numext::real(a) - b;
+    out[out_idx++] = a * b;
+    out[out_idx++] = a * numext::real(b);
+    out[out_idx++] = numext::real(a) * b;
+    out[out_idx++] = a / b;
+    out[out_idx++] = a / numext::real(b);
+    out[out_idx++] = numext::real(a) / b;
+    
+    out[out_idx] = a; out[out_idx++] += b;
+    out[out_idx] = a; out[out_idx++] -= b;
+    out[out_idx] = a; out[out_idx++] *= b;
+    out[out_idx] = a; out[out_idx++] /= b;
+    
+    const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
+    const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
+    out[out_idx++] = (a == b ? true_value : false_value);
+    out[out_idx++] = (a == numext::real(b) ? true_value : false_value);
+    out[out_idx++] = (numext::real(a) == b ? true_value : false_value);
+    out[out_idx++] = (a != b ? true_value : false_value);
+    out[out_idx++] = (a != numext::real(b) ? true_value : false_value);
+    out[out_idx++] = (numext::real(a) != b ? true_value : false_value);
+    
+    // Vector versions.
+    T x1(in + i);
+    T x2(in + i + 1);
+    const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators;
+    const int size = T::MaxSizeAtCompileTime;
+    int block_idx = 0;
+    
+    Map<VectorX<ComplexType>> res(out + out_idx, res_size);
+    res.segment(block_idx, size) = -x1;
+    block_idx += size;
+    
+    res.segment(block_idx, size) = x1 + x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 + x2.real();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real() + x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 - x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1 - x2.real();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real() - x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() * x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() * x2.real().array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real().array() * x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() / x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.array() / x2.real().array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1.real().array() / x2.array();
+    block_idx += size;
+    
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array();
+    block_idx += size;
+    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array();
+    block_idx += size;
+
+    const T true_vector = T::Constant(true_value);
+    const T false_vector = T::Constant(false_value);
+    res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector);
+    block_idx += size;
+    // Mixing types in equality comparison does not work.
+    // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector);
+    // block_idx += size;
+    res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector);
+    block_idx += size;
+    // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector);
+    // block_idx += size;
+    // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector);
+    // block_idx += size;
+  }
+};
+
+template<typename T>
+struct replicate {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    T x1(in+i);
+    int step   = x1.size() * 4;
+    int stride = 3 * step;
+    
+    typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
+    MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
+    MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
+    MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
+  }
+};
+
+template<typename T>
+struct alloc_new_delete {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    int offset = 2*i*T::MaxSizeAtCompileTime;
+    T* x = new T(in + offset);
+    Eigen::Map<T> u(out + offset);
+    u = *x;
+    delete x;
+    
+    offset += T::MaxSizeAtCompileTime;
+    T* y = new T[1];
+    y[0] = T(in + offset);
+    Eigen::Map<T> v(out + offset);
+    v = y[0];    
+    delete[] y;
+  }
+};
+
+template<typename T>
+struct redux {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    int N = 10;
+    T x1(in+i);
+    out[i*N+0] = x1.minCoeff();
+    out[i*N+1] = x1.maxCoeff();
+    out[i*N+2] = x1.sum();
+    out[i*N+3] = x1.prod();
+    out[i*N+4] = x1.matrix().squaredNorm();
+    out[i*N+5] = x1.matrix().norm();
+    out[i*N+6] = x1.colwise().sum().maxCoeff();
+    out[i*N+7] = x1.rowwise().maxCoeff().sum();
+    out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
+  }
+};
+
+template<typename T1, typename T2>
+struct prod_test {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
+    T1 x1(in+i);
+    T2 x2(in+i+1);
+    Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
+    res += in[i] * x1 * x2;
+  }
+};
+
+template<typename T1, typename T2>
+struct diagonal {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+  {
+    using namespace Eigen;
+    T1 x1(in+i);
+    Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
+    res += x1.diagonal();
+  }
+};
+
+template<typename T>
+struct eigenvalues_direct {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
+    T M(in+i);
+    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
+    T A = M*M.adjoint();
+    SelfAdjointEigenSolver<T> eig;
+    eig.computeDirect(A);
+    res = eig.eigenvalues();
+  }
+};
+
+template<typename T>
+struct eigenvalues {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
+    T M(in+i);
+    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
+    T A = M*M.adjoint();
+    SelfAdjointEigenSolver<T> eig;
+    eig.compute(A);
+    res = eig.eigenvalues();
+  }
+};
+
+template<typename T>
+struct matrix_inverse {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    T M(in+i);
+    Map<T> res(out+i*T::MaxSizeAtCompileTime);
+    res = M.inverse();
+  }
+};
+
+template<typename T>
+struct numeric_limits_test {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    EIGEN_UNUSED_VARIABLE(in)
+    int out_idx = i * 5;
+    out[out_idx++] = numext::numeric_limits<float>::epsilon();
+    out[out_idx++] = (numext::numeric_limits<float>::max)();
+    out[out_idx++] = (numext::numeric_limits<float>::min)();
+    out[out_idx++] = numext::numeric_limits<float>::infinity();
+    out[out_idx++] = numext::numeric_limits<float>::quiet_NaN();
+  }
+};
+
+template<typename Type1, typename Type2>
+bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
+{
+  if (a.rows() != b.rows()) {
+    return false;
+  }
+  if (a.cols() != b.cols()) {
+    return false;
+  }
+  for (Index r = 0; r < a.rows(); ++r) {
+    for (Index c = 0; c < a.cols(); ++c) {
+      if (a(r, c) != b(r, c)
+          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c))) 
+          && !test_isApprox(a(r, c), b(r, c))) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+template<typename Kernel, typename Input, typename Output>
+void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Output out_ref, out_gpu;
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  out_ref = out_gpu = out;
+  #else
+  EIGEN_UNUSED_VARIABLE(in);
+  EIGEN_UNUSED_VARIABLE(out);
+  #endif
+  run_on_cpu (ker, n, in,  out_ref);
+  run_on_gpu(ker, n, in, out_gpu);
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  verifyIsApproxWithInfsNans(out_ref, out_gpu);
+  #endif
+}
+
+EIGEN_DECLARE_TEST(gpu_basic)
+{
+  ei_test_init_gpu();
+  
+  int nthreads = 100;
+  Eigen::VectorXf in, out;
+  Eigen::VectorXcf cfin, cfout;
+  
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  int data_size = nthreads * 512;
+  in.setRandom(data_size);
+  out.setConstant(data_size, -1);
+  cfin.setRandom(data_size);
+  cfout.setConstant(data_size, -1);
+  #endif
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );
+
+#if !defined(EIGEN_USE_HIP)
+  // FIXME
+  // These subtests result in a compile failure on the HIP platform
+  //
+  //  eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
+  //           base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
+  //           (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
+  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );
+
+  // HIP does not support new/delete on device.
+  CALL_SUBTEST( run_and_compare_to_gpu(alloc_new_delete<Vector3f>(), nthreads, in, out) );
+#endif
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
+
+  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );
+  
+  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
+
+  // Test std::complex.
+  CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
+  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
+
+  // numeric_limits
+  CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
+
+#if defined(__NVCC__)
+  // FIXME
+  // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
+  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
+  typedef Matrix<float,6,6> Matrix6f;
+  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
+#endif
+}
diff --git a/contrib/eigen/test/gpu_common.h b/contrib/eigen/test/gpu_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..c37eaa13fc372c792d9ff5ee97fafb0007055e92
--- /dev/null
+++ b/contrib/eigen/test/gpu_common.h
@@ -0,0 +1,176 @@
+#ifndef EIGEN_TEST_GPU_COMMON_H
+#define EIGEN_TEST_GPU_COMMON_H
+
+#ifdef EIGEN_USE_HIP
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+#else
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+  #include <cuda_runtime_api.h>
+#endif
+
+#include <iostream>
+
+#define EIGEN_USE_GPU
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
+dim3 threadIdx, blockDim, blockIdx;
+#endif
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  for(int i=0; i<n; i++)
+    ker(i, in.data(), out.data());
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+__global__
+EIGEN_HIP_LAUNCH_BOUNDS_1024
+void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
+{
+  int i = threadIdx.x + blockIdx.x*blockDim.x;
+  if(i<n) {
+    ker(i, in, out);
+  }
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  typename Input::Scalar*  d_in;
+  typename Output::Scalar* d_out;
+  std::ptrdiff_t in_bytes  = in.size()  * sizeof(typename Input::Scalar);
+  std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
+  
+  gpuMalloc((void**)(&d_in),  in_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+  
+  gpuMemcpy(d_in,  in.data(),  in_bytes,  gpuMemcpyHostToDevice);
+  gpuMemcpy(d_out, out.data(), out_bytes, gpuMemcpyHostToDevice);
+  
+  // Simple and non-optimal 1D mapping assuming n is not too large
+  // That's only for unit testing!
+  dim3 Blocks(128);
+  dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
+
+  gpuDeviceSynchronize();
+  
+#ifdef EIGEN_USE_HIP
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_gpu_meta_kernel<Kernel,
+				     typename std::decay<decltype(*d_in)>::type,
+				     typename std::decay<decltype(*d_out)>::type>), 
+		     dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
+#else
+  run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
+#endif
+  // Pre-launch errors.
+  gpuError_t err = gpuGetLastError();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
+  
+  // Kernel execution errors.
+  err = gpuDeviceSynchronize();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
+  
+  
+  // check inputs have not been modified
+  gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()),  d_in,  in_bytes,  gpuMemcpyDeviceToHost);
+  gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost);
+  
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Input  in_ref,  in_gpu;
+  Output out_ref, out_gpu;
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  in_ref = in_gpu = in;
+  out_ref = out_gpu = out;
+  #else
+  EIGEN_UNUSED_VARIABLE(in);
+  EIGEN_UNUSED_VARIABLE(out);
+  #endif
+  run_on_cpu (ker, n, in_ref,  out_ref);
+  run_on_gpu(ker, n, in_gpu, out_gpu);
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  VERIFY_IS_APPROX(in_ref, in_gpu);
+  VERIFY_IS_APPROX(out_ref, out_gpu);
+  #endif
+}
+
+struct compile_time_device_info {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const int* /*in*/, int* info) const
+  {
+    if (i == 0) {
+      EIGEN_UNUSED_VARIABLE(info)
+      #if defined(__CUDA_ARCH__)
+      info[0] = int(__CUDA_ARCH__ +0);
+      #endif
+      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+      info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+      #endif
+    }
+  }
+};
+
+void ei_test_init_gpu()
+{
+  int device = 0;
+  gpuDeviceProp_t deviceProp;
+  gpuGetDeviceProperties(&deviceProp, device);
+
+  ArrayXi dummy(1), info(10);
+  info = -1;
+  run_on_gpu(compile_time_device_info(),10,dummy,info);
+
+
+  std::cout << "GPU compile-time info:\n";
+  
+  #ifdef EIGEN_CUDACC
+  std::cout << "  EIGEN_CUDACC:                 " << int(EIGEN_CUDACC) << "\n";
+  #endif
+  
+  #ifdef EIGEN_CUDA_SDK_VER
+  std::cout << "  EIGEN_CUDA_SDK_VER:             " << int(EIGEN_CUDA_SDK_VER) << "\n";
+  #endif
+
+  #ifdef EIGEN_COMP_NVCC
+  std::cout << "  EIGEN_COMP_NVCC:             " << int(EIGEN_COMP_NVCC) << "\n";
+  #endif
+  
+  #ifdef EIGEN_HIPCC
+  std::cout << "  EIGEN_HIPCC:                 " << int(EIGEN_HIPCC) << "\n";
+  #endif
+
+  std::cout << "  EIGEN_CUDA_ARCH:             " << info[0] << "\n";  
+  std::cout << "  EIGEN_HIP_DEVICE_COMPILE:    " << info[1] << "\n";
+
+  std::cout << "GPU device info:\n";
+  std::cout << "  name:                        " << deviceProp.name << "\n";
+  std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << "\n";
+  std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << "\n";
+  std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
+  std::cout << "  warpSize:                    " << deviceProp.warpSize << "\n";
+  std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << "\n";
+  std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << "\n";
+  std::cout << "  clockRate:                   " << deviceProp.clockRate << "\n";
+  std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << "\n";
+  std::cout << "  computeMode:                 " << deviceProp.computeMode << "\n";
+}
+
+#endif // EIGEN_TEST_GPU_COMMON_H
diff --git a/contrib/eigen/test/half_float.cpp b/contrib/eigen/test/half_float.cpp
index b37b81903a35f754f46b08832713b13f9c2eeb6e..729de1bc725d2b0d4db43c7f9e48f90de92b50be 100644
--- a/contrib/eigen/test/half_float.cpp
+++ b/contrib/eigen/test/half_float.cpp
@@ -9,11 +9,10 @@
 
 #include "main.h"
 
-#include <Eigen/src/Core/arch/CUDA/Half.h>
+#include <Eigen/src/Core/arch/Default/Half.h>
 
-#ifdef EIGEN_HAS_CUDA_FP16
-#error "EIGEN_HAS_CUDA_FP16 should not be defined in this CPU unit test"
-#endif
+#define VERIFY_HALF_BITS_EQUAL(h, bits) \
+  VERIFY_IS_EQUAL((numext::bit_cast<numext::uint16_t>(h)), (static_cast<numext::uint16_t>(bits)))
 
 // Make sure it's possible to forward declare Eigen::half
 namespace Eigen {
@@ -26,37 +25,51 @@ void test_conversion()
 {
   using Eigen::half_impl::__half_raw;
 
+  // Round-trip bit-cast with uint16.
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(1.0f))),
+    half(1.0f));
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(0.5f))),
+    half(0.5f));
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(-0.33333f))),
+    half(-0.33333f));
+   VERIFY_IS_EQUAL(
+    numext::bit_cast<half>(numext::bit_cast<numext::uint16_t>(half(0.0f))),
+    half(0.0f));
+
   // Conversion from float.
-  VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
-  VERIFY_IS_EQUAL(half(0.5f).x, 0x3800);
-  VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555);
-  VERIFY_IS_EQUAL(half(0.0f).x, 0x0000);
-  VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000);
-  VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff);
-  VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00);  // Becomes infinity.
+  VERIFY_HALF_BITS_EQUAL(half(1.0f), 0x3c00);
+  VERIFY_HALF_BITS_EQUAL(half(0.5f), 0x3800);
+  VERIFY_HALF_BITS_EQUAL(half(0.33333f), 0x3555);
+  VERIFY_HALF_BITS_EQUAL(half(0.0f), 0x0000);
+  VERIFY_HALF_BITS_EQUAL(half(-0.0f), 0x8000);
+  VERIFY_HALF_BITS_EQUAL(half(65504.0f), 0x7bff);
+  VERIFY_HALF_BITS_EQUAL(half(65536.0f), 0x7c00);  // Becomes infinity.
 
   // Denormals.
-  VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001);
-  VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001);
-  VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
+  VERIFY_HALF_BITS_EQUAL(half(-5.96046e-08f), 0x8001);
+  VERIFY_HALF_BITS_EQUAL(half(5.96046e-08f), 0x0001);
+  VERIFY_HALF_BITS_EQUAL(half(1.19209e-07f), 0x0002);
 
   // Verify round-to-nearest-even behavior.
   float val1 = float(half(__half_raw(0x3c00)));
   float val2 = float(half(__half_raw(0x3c01)));
   float val3 = float(half(__half_raw(0x3c02)));
-  VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
-  VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
+  VERIFY_HALF_BITS_EQUAL(half(0.5f * (val1 + val2)), 0x3c00);
+  VERIFY_HALF_BITS_EQUAL(half(0.5f * (val2 + val3)), 0x3c02);
 
   // Conversion from int.
-  VERIFY_IS_EQUAL(half(-1).x, 0xbc00);
-  VERIFY_IS_EQUAL(half(0).x, 0x0000);
-  VERIFY_IS_EQUAL(half(1).x, 0x3c00);
-  VERIFY_IS_EQUAL(half(2).x, 0x4000);
-  VERIFY_IS_EQUAL(half(3).x, 0x4200);
+  VERIFY_HALF_BITS_EQUAL(half(-1), 0xbc00);
+  VERIFY_HALF_BITS_EQUAL(half(0), 0x0000);
+  VERIFY_HALF_BITS_EQUAL(half(1), 0x3c00);
+  VERIFY_HALF_BITS_EQUAL(half(2), 0x4000);
+  VERIFY_HALF_BITS_EQUAL(half(3), 0x4200);
 
   // Conversion from bool.
-  VERIFY_IS_EQUAL(half(false).x, 0x0000);
-  VERIFY_IS_EQUAL(half(true).x, 0x3c00);
+  VERIFY_HALF_BITS_EQUAL(half(false), 0x0000);
+  VERIFY_HALF_BITS_EQUAL(half(true), 0x3c00);
 
   // Conversion to float.
   VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f);
@@ -96,24 +109,50 @@ void test_conversion()
   VERIFY((numext::isinf)(half(1.0 / 0.0)));
   VERIFY((numext::isinf)(half(-1.0 / 0.0)));
 #endif
+
+  // Conversion to bool
+  VERIFY(!static_cast<bool>(half(0.0)));
+  VERIFY(!static_cast<bool>(half(-0.0)));
+  VERIFY(static_cast<bool>(half(__half_raw(0x7bff))));
+  VERIFY(static_cast<bool>(half(-0.33333)));
+  VERIFY(static_cast<bool>(half(1.0)));
+  VERIFY(static_cast<bool>(half(-1.0)));
+  VERIFY(static_cast<bool>(half(-5.96046e-08f)));
 }
 
 void test_numtraits()
 {
-  std::cout << "epsilon       = " << NumTraits<half>::epsilon() << "  (0x" << std::hex << NumTraits<half>::epsilon().x << ")" << std::endl;
-  std::cout << "highest       = " << NumTraits<half>::highest() << "  (0x" << std::hex << NumTraits<half>::highest().x << ")" << std::endl;
-  std::cout << "lowest        = " << NumTraits<half>::lowest() << "  (0x" << std::hex << NumTraits<half>::lowest().x << ")" << std::endl;
-  std::cout << "min           = " << (std::numeric_limits<half>::min)() << "  (0x" << std::hex << half((std::numeric_limits<half>::min)()).x << ")" << std::endl;
-  std::cout << "denorm min    = " << (std::numeric_limits<half>::denorm_min)() << "  (0x" << std::hex << half((std::numeric_limits<half>::denorm_min)()).x << ")" << std::endl;
-  std::cout << "infinity      = " << NumTraits<half>::infinity() << "  (0x" << std::hex << NumTraits<half>::infinity().x << ")" << std::endl;
-  std::cout << "quiet nan     = " << NumTraits<half>::quiet_NaN() << "  (0x" << std::hex << NumTraits<half>::quiet_NaN().x << ")" << std::endl;
-  std::cout << "signaling nan = " << std::numeric_limits<half>::signaling_NaN() << "  (0x" << std::hex << std::numeric_limits<half>::signaling_NaN().x << ")" << std::endl;
+  std::cout << "epsilon       = " << NumTraits<half>::epsilon() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::epsilon()) << ")" << std::endl;
+  std::cout << "highest       = " << NumTraits<half>::highest() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::highest()) << ")" << std::endl;
+  std::cout << "lowest        = " << NumTraits<half>::lowest() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::lowest()) << ")" << std::endl;
+  std::cout << "min           = " << (std::numeric_limits<half>::min)() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(half((std::numeric_limits<half>::min)())) << ")" << std::endl;
+  std::cout << "denorm min    = " << (std::numeric_limits<half>::denorm_min)() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(half((std::numeric_limits<half>::denorm_min)())) << ")" << std::endl;
+  std::cout << "infinity      = " << NumTraits<half>::infinity() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::infinity()) << ")" << std::endl;
+  std::cout << "quiet nan     = " << NumTraits<half>::quiet_NaN() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(NumTraits<half>::quiet_NaN()) << ")" << std::endl;
+  std::cout << "signaling nan = " << std::numeric_limits<half>::signaling_NaN() << "  (0x" << std::hex << numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::signaling_NaN()) << ")" << std::endl;
 
   VERIFY(NumTraits<half>::IsSigned);
 
-  VERIFY_IS_EQUAL( std::numeric_limits<half>::infinity().x, half(std::numeric_limits<float>::infinity()).x );
-  VERIFY_IS_EQUAL( std::numeric_limits<half>::quiet_NaN().x, half(std::numeric_limits<float>::quiet_NaN()).x );
-  VERIFY_IS_EQUAL( std::numeric_limits<half>::signaling_NaN().x, half(std::numeric_limits<float>::signaling_NaN()).x );
+  VERIFY_IS_EQUAL(
+    numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::infinity()),
+    numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::infinity())) );
+  // There is no guarantee that casting a 32-bit NaN to 16-bit has a precise
+  // bit pattern.  We test that it is in fact a NaN, then test the signaling
+  // bit (msb of significand is 1 for quiet, 0 for signaling).
+  const numext::uint16_t HALF_QUIET_BIT = 0x0200;
+  VERIFY(
+    (numext::isnan)(std::numeric_limits<half>::quiet_NaN())
+    && (numext::isnan)(half(std::numeric_limits<float>::quiet_NaN()))
+    && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::quiet_NaN()) & HALF_QUIET_BIT) > 0)
+    && ((numext::bit_cast<numext::uint16_t>(half(std::numeric_limits<float>::quiet_NaN())) & HALF_QUIET_BIT) > 0) );
+  // After a cast to half, a signaling NaN may become non-signaling
+  // (e.g. in the case of casting float to native __fp16). Thus, we check that
+  // both are NaN, and that only the `numeric_limits` version is signaling.
+  VERIFY(
+    (numext::isnan)(std::numeric_limits<half>::signaling_NaN())
+    && (numext::isnan)(half(std::numeric_limits<float>::signaling_NaN()))
+    && ((numext::bit_cast<numext::uint16_t>(std::numeric_limits<half>::signaling_NaN()) & HALF_QUIET_BIT) == 0) );
+
   VERIFY( (std::numeric_limits<half>::min)() > half(0.f) );
   VERIFY( (std::numeric_limits<half>::denorm_min)() > half(0.f) );
   VERIFY( (std::numeric_limits<half>::min)()/half(2) > half(0.f) );
@@ -129,6 +168,20 @@ void test_arithmetic()
   VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f);
   VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f);
   VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f);
+  
+  half x(3);
+  half y = ++x;
+  VERIFY_IS_EQUAL(x, half(4));
+  VERIFY_IS_EQUAL(y, half(4));
+  y = --x;
+  VERIFY_IS_EQUAL(x, half(3));
+  VERIFY_IS_EQUAL(y, half(3));
+  y = x++;
+  VERIFY_IS_EQUAL(x, half(4));
+  VERIFY_IS_EQUAL(y, half(3));
+  y = x--;
+  VERIFY_IS_EQUAL(x, half(3));
+  VERIFY_IS_EQUAL(y, half(4));
 }
 
 void test_comparison()
@@ -201,6 +254,11 @@ void test_basic_functions()
   VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI));
   VERIFY_IS_APPROX(float(exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI));
 
+  VERIFY_IS_EQUAL(float(numext::expm1(half(0.0f))), 0.0f);
+  VERIFY_IS_EQUAL(float(expm1(half(0.0f))), 0.0f);
+  VERIFY_IS_APPROX(float(numext::expm1(half(2.0f))), 6.3890561f);
+  VERIFY_IS_APPROX(float(expm1(half(2.0f))), 6.3890561f);
+
   VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f);
   VERIFY_IS_EQUAL(float(log(half(1.0f))), 0.0f);
   VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f);
@@ -210,6 +268,11 @@ void test_basic_functions()
   VERIFY_IS_EQUAL(float(log1p(half(0.0f))), 0.0f);
   VERIFY_IS_APPROX(float(numext::log1p(half(10.0f))), 2.3978953f);
   VERIFY_IS_APPROX(float(log1p(half(10.0f))), 2.3978953f);
+  
+  VERIFY_IS_APPROX(numext::fmod(half(5.3f), half(2.0f)), half(1.3f));
+  VERIFY_IS_APPROX(fmod(half(5.3f), half(2.0f)), half(1.3f));
+  VERIFY_IS_APPROX(numext::fmod(half(-18.5f), half(-4.2f)), half(-1.7f));
+  VERIFY_IS_APPROX(fmod(half(-18.5f), half(-4.2f)), half(-1.7f));
 }
 
 void test_trigonometric_functions()
@@ -217,8 +280,8 @@ void test_trigonometric_functions()
   VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f)));
   VERIFY_IS_APPROX(cos(half(0.0f)), half(cosf(0.0f)));
   VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI)));
-  //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2)));
-  //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2)));
   VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f)));
 
   VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f)));
@@ -256,13 +319,31 @@ void test_array()
   ss << a1;
 }
 
-void test_half_float()
+void test_product()
+{
+  typedef Matrix<half,Dynamic,Dynamic> MatrixXh;
+  Index rows  = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  Index cols  = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+  MatrixXh Ah = MatrixXh::Random(rows,depth);
+  MatrixXh Bh = MatrixXh::Random(depth,cols);
+  MatrixXh Ch = MatrixXh::Random(rows,cols);
+  MatrixXf Af = Ah.cast<float>();
+  MatrixXf Bf = Bh.cast<float>();
+  MatrixXf Cf = Ch.cast<float>();
+  VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast<half>());
+}
+
+EIGEN_DECLARE_TEST(half_float)
 {
-  CALL_SUBTEST(test_conversion());
   CALL_SUBTEST(test_numtraits());
-  CALL_SUBTEST(test_arithmetic());
-  CALL_SUBTEST(test_comparison());
-  CALL_SUBTEST(test_basic_functions());
-  CALL_SUBTEST(test_trigonometric_functions());
-  CALL_SUBTEST(test_array());
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST(test_conversion());
+    CALL_SUBTEST(test_arithmetic());
+    CALL_SUBTEST(test_comparison());
+    CALL_SUBTEST(test_basic_functions());
+    CALL_SUBTEST(test_trigonometric_functions());
+    CALL_SUBTEST(test_array());
+    CALL_SUBTEST(test_product());
+  }
 }
diff --git a/contrib/eigen/test/hessenberg.cpp b/contrib/eigen/test/hessenberg.cpp
index 96bc19e2edfe90f044ee9ec8048e4015a7e454f1..0e1b0098dacd12ea9cef0648ca8212a8906450f5 100644
--- a/contrib/eigen/test/hessenberg.cpp
+++ b/contrib/eigen/test/hessenberg.cpp
@@ -49,7 +49,7 @@ template<typename Scalar,int Size> void hessenberg(int size = Size)
   // TODO: Add tests for packedMatrix() and householderCoefficients()
 }
 
-void test_hessenberg()
+EIGEN_DECLARE_TEST(hessenberg)
 {
   CALL_SUBTEST_1(( hessenberg<std::complex<double>,1>() ));
   CALL_SUBTEST_2(( hessenberg<std::complex<double>,2>() ));
diff --git a/contrib/eigen/test/householder.cpp b/contrib/eigen/test/householder.cpp
index e70b7ea256db0490771e5021ee7db379a4156d91..cad8138a2aa11701d2b14a5ccf3d864539c6eb37 100644
--- a/contrib/eigen/test/householder.cpp
+++ b/contrib/eigen/test/householder.cpp
@@ -48,6 +48,17 @@ template<typename MatrixType> void householder(const MatrixType& m)
   v1.applyHouseholderOnTheLeft(essential,beta,tmp);
   VERIFY_IS_APPROX(v1.norm(), v2.norm());
 
+  // reconstruct householder matrix:
+  SquareMatrixType id, H1, H2;
+  id.setIdentity(rows, rows);
+  H1 = H2 = id;
+  VectorType vv(rows);
+  vv << Scalar(1), essential;
+  H1.applyHouseholderOnTheLeft(essential, beta, tmp);
+  H2.applyHouseholderOnTheRight(essential, beta, tmp);
+  VERIFY_IS_APPROX(H1, H2);
+  VERIFY_IS_APPROX(H1, id - beta * vv*vv.adjoint());
+
   MatrixType m1(rows, cols),
              m2(rows, cols);
 
@@ -68,7 +79,7 @@ template<typename MatrixType> void householder(const MatrixType& m)
   m3.rowwise() = v1.transpose();
   m4 = m3;
   m3.row(0).makeHouseholder(essential, beta, alpha);
-  m3.applyHouseholderOnTheRight(essential,beta,tmp);
+  m3.applyHouseholderOnTheRight(essential.conjugate(),beta,tmp);
   VERIFY_IS_APPROX(m3.norm(), m4.norm());
   if(rows>=2) VERIFY_IS_MUCH_SMALLER_THAN(m3.block(0,1,rows,rows-1).norm(), m3.norm());
   VERIFY_IS_MUCH_SMALLER_THAN(numext::imag(m3(0,0)), numext::real(m3(0,0)));
@@ -103,14 +114,14 @@ template<typename MatrixType> void householder(const MatrixType& m)
   VERIFY_IS_APPROX(hseq_mat.adjoint(),    hseq_mat_adj);
   VERIFY_IS_APPROX(hseq_mat.conjugate(),  hseq_mat_conj);
   VERIFY_IS_APPROX(hseq_mat.transpose(),  hseq_mat_trans);
-  VERIFY_IS_APPROX(hseq_mat * m6,             hseq_mat * m6);
-  VERIFY_IS_APPROX(hseq_mat.adjoint() * m6,   hseq_mat_adj * m6);
-  VERIFY_IS_APPROX(hseq_mat.conjugate() * m6, hseq_mat_conj * m6);
-  VERIFY_IS_APPROX(hseq_mat.transpose() * m6, hseq_mat_trans * m6);
-  VERIFY_IS_APPROX(m6 * hseq_mat,             m6 * hseq_mat);
-  VERIFY_IS_APPROX(m6 * hseq_mat.adjoint(),   m6 * hseq_mat_adj);
-  VERIFY_IS_APPROX(m6 * hseq_mat.conjugate(), m6 * hseq_mat_conj);
-  VERIFY_IS_APPROX(m6 * hseq_mat.transpose(), m6 * hseq_mat_trans);
+  VERIFY_IS_APPROX(hseq * m6,             hseq_mat * m6);
+  VERIFY_IS_APPROX(hseq.adjoint() * m6,   hseq_mat_adj * m6);
+  VERIFY_IS_APPROX(hseq.conjugate() * m6, hseq_mat_conj * m6);
+  VERIFY_IS_APPROX(hseq.transpose() * m6, hseq_mat_trans * m6);
+  VERIFY_IS_APPROX(m6 * hseq,             m6 * hseq_mat);
+  VERIFY_IS_APPROX(m6 * hseq.adjoint(),   m6 * hseq_mat_adj);
+  VERIFY_IS_APPROX(m6 * hseq.conjugate(), m6 * hseq_mat_conj);
+  VERIFY_IS_APPROX(m6 * hseq.transpose(), m6 * hseq_mat_trans);
 
   // test householder sequence on the right with a shift
 
@@ -122,7 +133,7 @@ template<typename MatrixType> void householder(const MatrixType& m)
   VERIFY_IS_APPROX(m3 * m5, m1); // test evaluating rhseq to a dense matrix, then applying
 }
 
-void test_householder()
+EIGEN_DECLARE_TEST(householder)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( householder(Matrix<double,2,2>()) );
diff --git a/contrib/eigen/test/incomplete_cholesky.cpp b/contrib/eigen/test/incomplete_cholesky.cpp
index 59ffe92595c36925a88ae68c68e61b757ee51014..ecc17f5c3582cbf3d39089ac0496dfec197b737b 100644
--- a/contrib/eigen/test/incomplete_cholesky.cpp
+++ b/contrib/eigen/test/incomplete_cholesky.cpp
@@ -12,14 +12,14 @@
 #include <Eigen/IterativeLinearSolvers>
 #include <unsupported/Eigen/IterativeSolvers>
 
-template<typename T, typename I> void test_incomplete_cholesky_T()
+template<typename T, typename I_> void test_incomplete_cholesky_T()
 {
-  typedef SparseMatrix<T,0,I> SparseMatrixType;
-  ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, AMDOrdering<I> > >        cg_illt_lower_amd;
-  ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, NaturalOrdering<I> > >    cg_illt_lower_nat;
-  ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, AMDOrdering<I> > >        cg_illt_upper_amd;
-  ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, NaturalOrdering<I> > >    cg_illt_upper_nat;
-  ConjugateGradient<SparseMatrixType, Upper|Lower, IncompleteCholesky<T, Lower, AMDOrdering<I> > >  cg_illt_uplo_amd;
+  typedef SparseMatrix<T,0,I_> SparseMatrixType;
+  ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, AMDOrdering<I_> > >        cg_illt_lower_amd;
+  ConjugateGradient<SparseMatrixType, Lower, IncompleteCholesky<T, Lower, NaturalOrdering<I_> > >    cg_illt_lower_nat;
+  ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, AMDOrdering<I_> > >        cg_illt_upper_amd;
+  ConjugateGradient<SparseMatrixType, Upper, IncompleteCholesky<T, Upper, NaturalOrdering<I_> > >    cg_illt_upper_nat;
+  ConjugateGradient<SparseMatrixType, Upper|Lower, IncompleteCholesky<T, Lower, AMDOrdering<I_> > >  cg_illt_uplo_amd;
   
 
   CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) );
@@ -29,14 +29,10 @@ template<typename T, typename I> void test_incomplete_cholesky_T()
   CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) );
 }
 
-void test_incomplete_cholesky()
+template<int>
+void bug1150()
 {
-  CALL_SUBTEST_1(( test_incomplete_cholesky_T<double,int>() ));
-  CALL_SUBTEST_2(( test_incomplete_cholesky_T<std::complex<double>, int>() ));
-  CALL_SUBTEST_3(( test_incomplete_cholesky_T<double,long int>() ));
-
-#ifdef EIGEN_TEST_PART_1
-    // regression for bug 1150
+  // regression for bug 1150
   for(int N = 1; N<20; ++N)
   {
     Eigen::MatrixXd b( N, N );
@@ -61,5 +57,13 @@ void test_incomplete_cholesky()
     VERIFY(solver.preconditioner().info() == Eigen::Success);
     VERIFY(solver.info() == Eigen::Success);
   }
-#endif
+}
+
+EIGEN_DECLARE_TEST(incomplete_cholesky)
+{
+  CALL_SUBTEST_1(( test_incomplete_cholesky_T<double,int>() ));
+  CALL_SUBTEST_2(( test_incomplete_cholesky_T<std::complex<double>, int>() ));
+  CALL_SUBTEST_3(( test_incomplete_cholesky_T<double,long int>() ));
+
+  CALL_SUBTEST_1(( bug1150<0>() ));
 }
diff --git a/contrib/eigen/test/indexed_view.cpp b/contrib/eigen/test/indexed_view.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72c54af6845ec34de338a34222738b50dab7838b
--- /dev/null
+++ b/contrib/eigen/test/indexed_view.cpp
@@ -0,0 +1,473 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef EIGEN_TEST_PART_2
+// Make sure we also check c++11 max implementation
+#define EIGEN_MAX_CPP_VER 11
+#endif
+
+#ifdef EIGEN_TEST_PART_3
+// Make sure we also check c++98 max implementation
+#define EIGEN_MAX_CPP_VER 03
+
+// We need to disable this warning when compiling with c++11 while limiting Eigen to c++98
+// Ideally we would rather configure the compiler to build in c++98 mode but this needs
+// to be done at the CMakeLists.txt level.
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+  #pragma GCC diagnostic ignored "-Wdeprecated"
+#endif
+
+#if defined(__GNUC__) && (__GNUC__ >=9)
+  #pragma GCC diagnostic ignored "-Wdeprecated-copy"
+#endif
+#if defined(__clang__) && (__clang_major__ >= 10)
+  #pragma clang diagnostic ignored "-Wdeprecated-copy"
+#endif
+
+#endif
+
+#include <valarray>
+#include <vector>
+#include "main.h"
+
+#if EIGEN_HAS_CXX11
+#include <array>
+#endif
+
+typedef std::pair<Index,Index> IndexPair;
+
+int encode(Index i, Index j) {
+  return int(i*100 + j);
+}
+
+IndexPair decode(Index ij) {
+  return IndexPair(ij / 100, ij % 100);
+}
+
+template<typename T>
+bool match(const T& xpr, std::string ref, std::string str_xpr = "") {
+  EIGEN_UNUSED_VARIABLE(str_xpr);
+  std::stringstream str;
+  str << xpr;
+  if(!(str.str() == ref))
+    std::cout << str_xpr << "\n" << xpr << "\n\n";
+  return str.str() == ref;
+}
+
+#define MATCH(X,R) match(X, R, #X)
+
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_eq(const T1& a, const T2& b)
+{
+  return (a == b).all();
+}
+
+template<typename T1,typename T2>
+bool is_same_seq(const T1& a, const T2& b)
+{
+  bool ok = a.first()==b.first() && a.size() == b.size() && Index(a.incrObject())==Index(b.incrObject());;
+  if(!ok)
+  {
+    std::cerr << "seqN(" << a.first() << ", " << a.size() << ", " << Index(a.incrObject()) << ") != ";
+    std::cerr << "seqN(" << b.first() << ", " << b.size() << ", " << Index(b.incrObject()) << ")\n";
+  }
+  return ok;
+}
+
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_seq_type(const T1& a, const T2& b)
+{
+  return is_same_seq(a,b);
+}
+
+
+
+#define VERIFY_EQ_INT(A,B) VERIFY_IS_APPROX(int(A),int(B))
+
+// C++03 does not allow local or unnamed enums as index
+enum DummyEnum { XX=0, YY=1 };
+
+void check_indexed_view()
+{
+  Index n = 10;
+
+  ArrayXd a = ArrayXd::LinSpaced(n,0,n-1);
+  Array<double,1,Dynamic> b = a.transpose();
+
+  #if EIGEN_COMP_CXXVER>=14
+  ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ref(encode));
+  #else
+  ArrayXXi A = ArrayXXi::NullaryExpr(n,n, std::ptr_fun(&encode));
+  #endif
+
+  for(Index i=0; i<n; ++i)
+    for(Index j=0; j<n; ++j)
+      VERIFY( decode(A(i,j)) == IndexPair(i,j) );
+
+  Array4i eii(4); eii << 3, 1, 6, 5;
+  std::valarray<int> vali(4); Map<ArrayXi>(&vali[0],4) = eii;
+  std::vector<int> veci(4); Map<ArrayXi>(veci.data(),4) = eii;
+
+  VERIFY( MATCH( A(3, seq(9,3,-1)),
+    "309  308  307  306  305  304  303")
+  );
+
+  VERIFY( MATCH( A(seqN(2,5), seq(9,3,-1)),
+    "209  208  207  206  205  204  203\n"
+    "309  308  307  306  305  304  303\n"
+    "409  408  407  406  405  404  403\n"
+    "509  508  507  506  505  504  503\n"
+    "609  608  607  606  605  604  603")
+  );
+
+  VERIFY( MATCH( A(seqN(2,5), 5),
+    "205\n"
+    "305\n"
+    "405\n"
+    "505\n"
+    "605")
+  );
+
+  VERIFY( MATCH( A(seqN(last,5,-1), seq(2,last)),
+    "902  903  904  905  906  907  908  909\n"
+    "802  803  804  805  806  807  808  809\n"
+    "702  703  704  705  706  707  708  709\n"
+    "602  603  604  605  606  607  608  609\n"
+    "502  503  504  505  506  507  508  509")
+  );
+
+  VERIFY( MATCH( A(eii, veci),
+    "303  301  306  305\n"
+    "103  101  106  105\n"
+    "603  601  606  605\n"
+    "503  501  506  505")
+  );
+
+  VERIFY( MATCH( A(eii, all),
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "100  101  102  103  104  105  106  107  108  109\n"
+    "600  601  602  603  604  605  606  607  608  609\n"
+    "500  501  502  503  504  505  506  507  508  509")
+  );
+
+  // take row number 3, and repeat it 5 times
+  VERIFY( MATCH( A(seqN(3,5,0), all),
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "300  301  302  303  304  305  306  307  308  309\n"
+    "300  301  302  303  304  305  306  307  308  309")
+  );
+
+  VERIFY( MATCH( a(seqN(3,3),0), "3\n4\n5" ) );
+  VERIFY( MATCH( a(seq(3,5)), "3\n4\n5" ) );
+  VERIFY( MATCH( a(seqN(3,3,1)), "3\n4\n5" ) );
+  VERIFY( MATCH( a(seqN(5,3,-1)), "5\n4\n3" ) );
+
+  VERIFY( MATCH( b(0,seqN(3,3)), "3  4  5" ) );
+  VERIFY( MATCH( b(seq(3,5)), "3  4  5" ) );
+  VERIFY( MATCH( b(seqN(3,3,1)), "3  4  5" ) );
+  VERIFY( MATCH( b(seqN(5,3,-1)), "5  4  3" ) );
+
+  VERIFY( MATCH( b(all), "0  1  2  3  4  5  6  7  8  9" ) );
+  VERIFY( MATCH( b(eii), "3  1  6  5" ) );
+
+  Array44i B;
+  B.setRandom();
+  VERIFY( (A(seqN(2,5), 5)).ColsAtCompileTime == 1);
+  VERIFY( (A(seqN(2,5), 5)).RowsAtCompileTime == Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,5), 5)).InnerStrideAtCompileTime , A.InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (A(seqN(2,5), 5)).OuterStrideAtCompileTime , A.col(5).OuterStrideAtCompileTime);
+
+  VERIFY_EQ_INT( (A(5,seqN(2,5))).InnerStrideAtCompileTime , A.row(5).InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (A(5,seqN(2,5))).OuterStrideAtCompileTime , A.row(5).OuterStrideAtCompileTime);
+  VERIFY_EQ_INT( (B(1,seqN(1,2))).InnerStrideAtCompileTime , B.row(1).InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (B(1,seqN(1,2))).OuterStrideAtCompileTime , B.row(1).OuterStrideAtCompileTime);
+
+  VERIFY_EQ_INT( (A(seqN(2,5), seq(1,3))).InnerStrideAtCompileTime , A.InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (A(seqN(2,5), seq(1,3))).OuterStrideAtCompileTime , A.OuterStrideAtCompileTime);
+  VERIFY_EQ_INT( (B(seqN(1,2), seq(1,3))).InnerStrideAtCompileTime , B.InnerStrideAtCompileTime);
+  VERIFY_EQ_INT( (B(seqN(1,2), seq(1,3))).OuterStrideAtCompileTime , B.OuterStrideAtCompileTime);
+  VERIFY_EQ_INT( (A(seqN(2,5,2), seq(1,3,2))).InnerStrideAtCompileTime , Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,5,2), seq(1,3,2))).OuterStrideAtCompileTime , Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,5,fix<2>), seq(1,3,fix<3>))).InnerStrideAtCompileTime , 2);
+  VERIFY_EQ_INT( (A(seqN(2,5,fix<2>), seq(1,3,fix<3>))).OuterStrideAtCompileTime , Dynamic);
+  VERIFY_EQ_INT( (B(seqN(1,2,fix<2>), seq(1,3,fix<3>))).InnerStrideAtCompileTime , 2);
+  VERIFY_EQ_INT( (B(seqN(1,2,fix<2>), seq(1,3,fix<3>))).OuterStrideAtCompileTime , 3*4);
+
+  VERIFY_EQ_INT( (A(seqN(2,fix<5>), seqN(1,fix<3>))).RowsAtCompileTime, 5);
+  VERIFY_EQ_INT( (A(seqN(2,fix<5>), seqN(1,fix<3>))).ColsAtCompileTime, 3);
+  VERIFY_EQ_INT( (A(seqN(2,fix<5>(5)), seqN(1,fix<3>(3)))).RowsAtCompileTime, 5);
+  VERIFY_EQ_INT( (A(seqN(2,fix<5>(5)), seqN(1,fix<3>(3)))).ColsAtCompileTime, 3);
+  VERIFY_EQ_INT( (A(seqN(2,fix<Dynamic>(5)), seqN(1,fix<Dynamic>(3)))).RowsAtCompileTime, Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,fix<Dynamic>(5)), seqN(1,fix<Dynamic>(3)))).ColsAtCompileTime, Dynamic);
+  VERIFY_EQ_INT( (A(seqN(2,fix<Dynamic>(5)), seqN(1,fix<Dynamic>(3)))).rows(), 5);
+  VERIFY_EQ_INT( (A(seqN(2,fix<Dynamic>(5)), seqN(1,fix<Dynamic>(3)))).cols(), 3);
+
+  VERIFY( is_same_seq_type( seqN(2,5,fix<-1>), seqN(2,5,fix<-1>(-1)) ) );
+  VERIFY( is_same_seq_type( seqN(2,5), seqN(2,5,fix<1>(1)) ) );
+  VERIFY( is_same_seq_type( seqN(2,5,3), seqN(2,5,fix<DynamicIndex>(3)) ) );
+  VERIFY( is_same_seq_type( seq(2,7,fix<3>), seqN(2,2,fix<3>) ) );
+  VERIFY( is_same_seq_type( seqN(2,fix<Dynamic>(5),3), seqN(2,5,fix<DynamicIndex>(3)) ) );
+  VERIFY( is_same_seq_type( seqN(2,fix<5>(5),fix<-2>), seqN(2,fix<5>,fix<-2>()) ) );
+
+  VERIFY( is_same_seq_type( seq(2,fix<5>), seqN(2,4) ) );
+#if EIGEN_HAS_CXX11
+  VERIFY( is_same_seq_type( seq(fix<2>,fix<5>), seqN(fix<2>,fix<4>) ) );
+  VERIFY( is_same_seq( seqN(2,std::integral_constant<int,5>(),std::integral_constant<int,-2>()), seqN(2,fix<5>,fix<-2>()) ) );
+  VERIFY( is_same_seq( seq(std::integral_constant<int,1>(),std::integral_constant<int,5>(),std::integral_constant<int,2>()),
+                       seq(fix<1>,fix<5>,fix<2>()) ) );
+  VERIFY( is_same_seq_type( seqN(2,std::integral_constant<int,5>(),std::integral_constant<int,-2>()), seqN(2,fix<5>,fix<-2>()) ) );
+  VERIFY( is_same_seq_type( seq(std::integral_constant<int,1>(),std::integral_constant<int,5>(),std::integral_constant<int,2>()),
+                            seq(fix<1>,fix<5>,fix<2>()) ) );
+
+  VERIFY( is_same_seq_type( seqN(2,std::integral_constant<int,5>()), seqN(2,fix<5>) ) );
+  VERIFY( is_same_seq_type( seq(std::integral_constant<int,1>(),std::integral_constant<int,5>()), seq(fix<1>,fix<5>) ) );
+#else
+  // sorry, no compile-time size recovery in c++98/03
+  VERIFY( is_same_seq( seq(fix<2>,fix<5>), seqN(fix<2>,fix<4>) ) );
+#endif
+
+  VERIFY( (A(seqN(2,fix<5>), 5)).RowsAtCompileTime == 5);
+  VERIFY( (A(4, all)).ColsAtCompileTime == Dynamic);
+  VERIFY( (A(4, all)).RowsAtCompileTime == 1);
+  VERIFY( (B(1, all)).ColsAtCompileTime == 4);
+  VERIFY( (B(1, all)).RowsAtCompileTime == 1);
+  VERIFY( (B(all,1)).ColsAtCompileTime == 1);
+  VERIFY( (B(all,1)).RowsAtCompileTime == 4);
+
+  VERIFY(int( (A(all, eii)).ColsAtCompileTime) == int(eii.SizeAtCompileTime));
+  VERIFY_EQ_INT( (A(eii, eii)).Flags&DirectAccessBit, (unsigned int)(0));
+  VERIFY_EQ_INT( (A(eii, eii)).InnerStrideAtCompileTime, 0);
+  VERIFY_EQ_INT( (A(eii, eii)).OuterStrideAtCompileTime, 0);
+
+  VERIFY_IS_APPROX( A(seq(n-1,2,-2), seqN(n-1-6,3,-1)), A(seq(last,2,fix<-2>), seqN(last-6,3,fix<-1>)) );
+
+  VERIFY_IS_APPROX( A(seq(n-1,2,-2), seqN(n-1-6,4)), A(seq(last,2,-2), seqN(last-6,4)) );
+  VERIFY_IS_APPROX( A(seq(n-1-6,n-1-2), seqN(n-1-6,4)), A(seq(last-6,last-2), seqN(6+last-6-6,4)) );
+  VERIFY_IS_APPROX( A(seq((n-1)/2,(n)/2+3), seqN(2,4)), A(seq(last/2,(last+1)/2+3), seqN(last+2-last,4)) );
+  VERIFY_IS_APPROX( A(seq(n-2,2,-2), seqN(n-8,4)), A(seq(lastp1-2,2,-2), seqN(lastp1-8,4)) );
+
+  // Check all combinations of seq:
+  VERIFY_IS_APPROX( A(seq(1,n-1-2,2), seq(1,n-1-2,2)), A(seq(1,last-2,2), seq(1,last-2,fix<2>)) );
+  VERIFY_IS_APPROX( A(seq(n-1-5,n-1-2,2), seq(n-1-5,n-1-2,2)), A(seq(last-5,last-2,2), seq(last-5,last-2,fix<2>)) );
+  VERIFY_IS_APPROX( A(seq(n-1-5,7,2), seq(n-1-5,7,2)), A(seq(last-5,7,2), seq(last-5,7,fix<2>)) );
+  VERIFY_IS_APPROX( A(seq(1,n-1-2), seq(n-1-5,7)), A(seq(1,last-2), seq(last-5,7)) );
+  VERIFY_IS_APPROX( A(seq(n-1-5,n-1-2), seq(n-1-5,n-1-2)), A(seq(last-5,last-2), seq(last-5,last-2)) );
+
+  VERIFY_IS_APPROX( A.col(A.cols()-1), A(all,last) );
+  VERIFY_IS_APPROX( A(A.rows()-2, A.cols()/2), A(last-1, lastp1/2) );
+  VERIFY_IS_APPROX( a(a.size()-2), a(last-1) );
+  VERIFY_IS_APPROX( a(a.size()/2), a((last+1)/2) );
+
+  // Check fall-back to Block
+  {
+    VERIFY( is_same_eq(A.col(0), A(all,0)) );
+    VERIFY( is_same_eq(A.row(0), A(0,all)) );
+    VERIFY( is_same_eq(A.block(0,0,2,2), A(seqN(0,2),seq(0,1))) );
+    VERIFY( is_same_eq(A.middleRows(2,4), A(seqN(2,4),all)) );
+    VERIFY( is_same_eq(A.middleCols(2,4), A(all,seqN(2,4))) );
+
+    VERIFY( is_same_eq(A.col(A.cols()-1), A(all,last)) );
+
+    const ArrayXXi& cA(A);
+    VERIFY( is_same_eq(cA.col(0), cA(all,0)) );
+    VERIFY( is_same_eq(cA.row(0), cA(0,all)) );
+    VERIFY( is_same_eq(cA.block(0,0,2,2), cA(seqN(0,2),seq(0,1))) );
+    VERIFY( is_same_eq(cA.middleRows(2,4), cA(seqN(2,4),all)) );
+    VERIFY( is_same_eq(cA.middleCols(2,4), cA(all,seqN(2,4))) );
+
+    VERIFY( is_same_eq(a.head(4), a(seq(0,3))) );
+    VERIFY( is_same_eq(a.tail(4), a(seqN(last-3,4))) );
+    VERIFY( is_same_eq(a.tail(4), a(seq(lastp1-4,last))) );
+    VERIFY( is_same_eq(a.segment<4>(3), a(seqN(3,fix<4>))) );
+  }
+
+  ArrayXXi A1=A, A2 = ArrayXXi::Random(4,4);
+  ArrayXi range25(4); range25 << 3,2,4,5;
+  A1(seqN(3,4),seq(2,5)) = A2;
+  VERIFY_IS_APPROX( A1.block(3,2,4,4), A2 );
+  A1 = A;
+  A2.setOnes();
+  A1(seq(6,3,-1),range25) = A2;
+  VERIFY_IS_APPROX( A1.block(3,2,4,4), A2 );
+
+  // check reverse
+  {
+    VERIFY( is_same_seq_type( seq(3,7).reverse(), seqN(7,5,fix<-1>)  ) );
+    VERIFY( is_same_seq_type( seq(7,3,fix<-2>).reverse(), seqN(3,3,fix<2>)  ) );
+    VERIFY_IS_APPROX( a(seqN(2,last/2).reverse()), a(seqN(2+(last/2-1)*1,last/2,fix<-1>)) );
+    VERIFY_IS_APPROX( a(seqN(last/2,fix<4>).reverse()),a(seqN(last/2,fix<4>)).reverse() );
+    VERIFY_IS_APPROX( A(seq(last-5,last-1,2).reverse(), seqN(last-3,3,fix<-2>).reverse()),
+                      A(seq(last-5,last-1,2), seqN(last-3,3,fix<-2>)).reverse() );
+  }
+
+#if EIGEN_HAS_CXX11
+  // check lastN
+  VERIFY_IS_APPROX( a(lastN(3)), a.tail(3) );
+  VERIFY( MATCH( a(lastN(3)), "7\n8\n9" ) );
+  VERIFY_IS_APPROX( a(lastN(fix<3>())), a.tail<3>() );
+  VERIFY( MATCH( a(lastN(3,2)), "5\n7\n9" ) );
+  VERIFY( MATCH( a(lastN(3,fix<2>())), "5\n7\n9" ) );
+  VERIFY( a(lastN(fix<3>())).SizeAtCompileTime == 3 );
+
+  VERIFY( (A(all, std::array<int,4>{{1,3,2,4}})).ColsAtCompileTime == 4);
+
+  VERIFY_IS_APPROX( (A(std::array<int,3>{{1,3,5}}, std::array<int,4>{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) );
+
+#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
+  VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array<int,4>{{3, 1, 6, 5}}, all) );
+  VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array<int,4>{{3, 1, 6, 5}}) );
+  VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array<int,3>{{1,3,5}},std::array<int,4>{{3, 1, 6, 5}}) );
+
+  VERIFY_IS_EQUAL( A({1,3,5},{3, 1, 6, 5}).RowsAtCompileTime, 3 );
+  VERIFY_IS_EQUAL( A({1,3,5},{3, 1, 6, 5}).ColsAtCompileTime, 4 );
+
+  VERIFY_IS_APPROX( a({3, 1, 6, 5}), a(std::array<int,4>{{3, 1, 6, 5}}) );
+  VERIFY_IS_EQUAL( a({1,3,5}).SizeAtCompileTime, 3 );
+
+  VERIFY_IS_APPROX( b({3, 1, 6, 5}), b(std::array<int,4>{{3, 1, 6, 5}}) );
+  VERIFY_IS_EQUAL( b({1,3,5}).SizeAtCompileTime, 3 );
+#endif
+
+#endif
+
+  // check mat(i,j) with weird types for i and j
+  {
+    VERIFY_IS_APPROX( A(B.RowsAtCompileTime-1, 1), A(3,1) );
+    VERIFY_IS_APPROX( A(B.RowsAtCompileTime, 1), A(4,1) );
+    VERIFY_IS_APPROX( A(B.RowsAtCompileTime-1, B.ColsAtCompileTime-1), A(3,3) );
+    VERIFY_IS_APPROX( A(B.RowsAtCompileTime, B.ColsAtCompileTime), A(4,4) );
+    const Index I_ = 3, J_ = 4;
+    VERIFY_IS_APPROX( A(I_,J_), A(3,4) );
+  }
+
+  // check extended block API
+  {
+    VERIFY( is_same_eq( A.block<3,4>(1,1), A.block(1,1,fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( A.block<3,4>(1,1,3,4), A.block(1,1,fix<3>(),fix<4>(4))) );
+    VERIFY( is_same_eq( A.block<3,Dynamic>(1,1,3,4), A.block(1,1,fix<3>,4)) );
+    VERIFY( is_same_eq( A.block<Dynamic,4>(1,1,3,4), A.block(1,1,fix<Dynamic>(3),fix<4>)) );
+    VERIFY( is_same_eq( A.block(1,1,3,4), A.block(1,1,fix<Dynamic>(3),fix<Dynamic>(4))) );
+
+    VERIFY( is_same_eq( A.topLeftCorner<3,4>(), A.topLeftCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( A.bottomLeftCorner<3,4>(), A.bottomLeftCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( A.bottomRightCorner<3,4>(), A.bottomRightCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( A.topRightCorner<3,4>(), A.topRightCorner(fix<3>,fix<4>)) );
+
+    VERIFY( is_same_eq( A.leftCols<3>(), A.leftCols(fix<3>)) );
+    VERIFY( is_same_eq( A.rightCols<3>(), A.rightCols(fix<3>)) );
+    VERIFY( is_same_eq( A.middleCols<3>(1), A.middleCols(1,fix<3>)) );
+
+    VERIFY( is_same_eq( A.topRows<3>(), A.topRows(fix<3>)) );
+    VERIFY( is_same_eq( A.bottomRows<3>(), A.bottomRows(fix<3>)) );
+    VERIFY( is_same_eq( A.middleRows<3>(1), A.middleRows(1,fix<3>)) );
+
+    VERIFY( is_same_eq( a.segment<3>(1), a.segment(1,fix<3>)) );
+    VERIFY( is_same_eq( a.head<3>(), a.head(fix<3>)) );
+    VERIFY( is_same_eq( a.tail<3>(), a.tail(fix<3>)) );
+
+    const ArrayXXi& cA(A);
+    VERIFY( is_same_eq( cA.block<Dynamic,4>(1,1,3,4), cA.block(1,1,fix<Dynamic>(3),fix<4>)) );
+
+    VERIFY( is_same_eq( cA.topLeftCorner<3,4>(), cA.topLeftCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( cA.bottomLeftCorner<3,4>(), cA.bottomLeftCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( cA.bottomRightCorner<3,4>(), cA.bottomRightCorner(fix<3>,fix<4>)) );
+    VERIFY( is_same_eq( cA.topRightCorner<3,4>(), cA.topRightCorner(fix<3>,fix<4>)) );
+
+    VERIFY( is_same_eq( cA.leftCols<3>(), cA.leftCols(fix<3>)) );
+    VERIFY( is_same_eq( cA.rightCols<3>(), cA.rightCols(fix<3>)) );
+    VERIFY( is_same_eq( cA.middleCols<3>(1), cA.middleCols(1,fix<3>)) );
+
+    VERIFY( is_same_eq( cA.topRows<3>(), cA.topRows(fix<3>)) );
+    VERIFY( is_same_eq( cA.bottomRows<3>(), cA.bottomRows(fix<3>)) );
+    VERIFY( is_same_eq( cA.middleRows<3>(1), cA.middleRows(1,fix<3>)) );
+  }
+
+  // Check compilation of enums as index type:
+  a(XX) = 1;
+  A(XX,YY) = 1;
+  // Anonymous enums only work with C++11
+#if EIGEN_HAS_CXX11
+  enum { X=0, Y=1 };
+  a(X) = 1;
+  A(X,Y) = 1;
+  A(XX,Y) = 1;
+  A(X,YY) = 1;
+#endif
+
+  // Check compilation of varying integer types as index types:
+  Index i = n/2;
+  short i_short(i);
+  std::size_t i_sizet(i);
+  VERIFY_IS_EQUAL( a(i), a.coeff(i_short) );
+  VERIFY_IS_EQUAL( a(i), a.coeff(i_sizet) );
+
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i_short) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_short) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_sizet) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i_short) );
+  VERIFY_IS_EQUAL( A(i,i), A.coeff(5, i_sizet) );
+
+  // Regression test for Max{Rows,Cols}AtCompileTime
+  {
+    Matrix3i A3 = Matrix3i::Random();
+    ArrayXi ind(5); ind << 1,1,1,1,1;
+    VERIFY_IS_EQUAL( A3(ind,ind).eval(), MatrixXi::Constant(5,5,A3(1,1)) );
+  }
+
+  // Regression for bug 1736
+  {
+    VERIFY_IS_APPROX(A(all, eii).col(0).eval(), A.col(eii(0)));
+    A(all, eii).col(0) = A.col(eii(0));
+  }
+
+  // bug 1815: IndexedView should allow linear access
+  {
+    VERIFY( MATCH( b(eii)(0), "3" ) );
+    VERIFY( MATCH( a(eii)(0), "3" ) );
+    VERIFY( MATCH( A(1,eii)(0), "103"));
+    VERIFY( MATCH( A(eii,1)(0), "301"));
+    VERIFY( MATCH( A(1,all)(1), "101"));
+    VERIFY( MATCH( A(all,1)(1), "101"));
+  }
+
+#if EIGEN_HAS_CXX11
+  //Bug IndexView with a single static row should be RowMajor:
+  {
+    // A(1, seq(0,2,1)).cwiseAbs().colwise().replicate(2).eval();
+    STATIC_CHECK(( (internal::evaluator<decltype( A(1,seq(0,2,1)) )>::Flags & RowMajorBit) == RowMajorBit ));
+  }
+#endif
+
+}
+
+EIGEN_DECLARE_TEST(indexed_view)
+{
+//   for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( check_indexed_view() );
+    CALL_SUBTEST_2( check_indexed_view() );
+    CALL_SUBTEST_3( check_indexed_view() );
+//   }
+
+  // static checks of some internals:
+  STATIC_CHECK(( internal::is_valid_index_type<int>::value ));
+  STATIC_CHECK(( internal::is_valid_index_type<unsigned int>::value ));
+  STATIC_CHECK(( internal::is_valid_index_type<short>::value ));
+  STATIC_CHECK(( internal::is_valid_index_type<std::ptrdiff_t>::value ));
+  STATIC_CHECK(( internal::is_valid_index_type<std::size_t>::value ));
+  STATIC_CHECK(( !internal::valid_indexed_view_overload<int,int>::value ));
+  STATIC_CHECK(( !internal::valid_indexed_view_overload<int,std::ptrdiff_t>::value ));
+  STATIC_CHECK(( !internal::valid_indexed_view_overload<std::ptrdiff_t,int>::value ));
+  STATIC_CHECK(( !internal::valid_indexed_view_overload<std::size_t,int>::value ));
+}
diff --git a/contrib/eigen/test/initializer_list_construction.cpp b/contrib/eigen/test/initializer_list_construction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a9c49e8d476e60cf1103f930319298bf7f81304
--- /dev/null
+++ b/contrib/eigen/test/initializer_list_construction.cpp
@@ -0,0 +1,385 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 David Tellenbach <david.tellenbach@tellnotes.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_STATIC_ASSERT
+
+#include "main.h"
+
+template<typename Scalar, bool is_integer = NumTraits<Scalar>::IsInteger>
+struct TestMethodDispatching {
+  static void run() {}
+};
+
+template<typename Scalar>
+struct TestMethodDispatching<Scalar, 1> {
+  static void run()
+  {
+    {
+      Matrix<Scalar, Dynamic, Dynamic> m {3, 4};
+      Array<Scalar, Dynamic, Dynamic> a {3, 4};
+      VERIFY(m.rows() == 3);
+      VERIFY(m.cols() == 4);
+      VERIFY(a.rows() == 3);
+      VERIFY(a.cols() == 4);
+    }
+    {
+      Matrix<Scalar, 1, 2> m {3, 4};
+      Array<Scalar, 1, 2> a {3, 4};
+      VERIFY(m(0) == 3);
+      VERIFY(m(1) == 4);
+      VERIFY(a(0) == 3);
+      VERIFY(a(1) == 4);
+    }
+    {
+      Matrix<Scalar, 2, 1> m {3, 4};
+      Array<Scalar, 2, 1> a {3, 4};
+      VERIFY(m(0) == 3);
+      VERIFY(m(1) == 4);
+      VERIFY(a(0) == 3);
+      VERIFY(a(1) == 4);
+    }
+  }
+};
+
+template<typename Vec4, typename Vec5> void fixedsizeVariadicVectorConstruction2()
+{
+  {
+    Vec4 ref = Vec4::Random();
+    Vec4 v{ ref[0], ref[1], ref[2], ref[3] };
+    VERIFY_IS_APPROX(v, ref);
+    VERIFY_IS_APPROX(v, (Vec4( ref[0], ref[1], ref[2], ref[3] )));
+    VERIFY_IS_APPROX(v, (Vec4({ref[0], ref[1], ref[2], ref[3]})));
+
+    Vec4 v2 = { ref[0], ref[1], ref[2], ref[3] };
+    VERIFY_IS_APPROX(v2, ref);
+  }
+  {
+    Vec5 ref = Vec5::Random();
+    Vec5 v{ ref[0], ref[1], ref[2], ref[3], ref[4] };
+    VERIFY_IS_APPROX(v, ref);
+    VERIFY_IS_APPROX(v, (Vec5( ref[0], ref[1], ref[2], ref[3], ref[4] )));
+    VERIFY_IS_APPROX(v, (Vec5({ref[0], ref[1], ref[2], ref[3], ref[4]})));
+
+    Vec5 v2 = { ref[0], ref[1], ref[2], ref[3], ref[4] };
+    VERIFY_IS_APPROX(v2, ref);
+  }
+}
+
+#define CHECK_MIXSCALAR_V5_APPROX(V, A0, A1, A2, A3, A4) { \
+  VERIFY_IS_APPROX(V[0], Scalar(A0) ); \
+  VERIFY_IS_APPROX(V[1], Scalar(A1) ); \
+  VERIFY_IS_APPROX(V[2], Scalar(A2) ); \
+  VERIFY_IS_APPROX(V[3], Scalar(A3) ); \
+  VERIFY_IS_APPROX(V[4], Scalar(A4) ); \
+}
+
+#define CHECK_MIXSCALAR_V5(VEC5, A0, A1, A2, A3, A4) { \
+  typedef VEC5::Scalar Scalar; \
+  VEC5 v = { A0 , A1 , A2 , A3 , A4 }; \
+  CHECK_MIXSCALAR_V5_APPROX(v, A0 , A1 , A2 , A3 , A4); \
+}
+
+template<int> void fixedsizeVariadicVectorConstruction3()
+{
+  typedef Matrix<double,5,1> Vec5;
+  typedef Array<float,5,1> Arr5;
+  CHECK_MIXSCALAR_V5(Vec5, 1, 2., -3, 4.121, 5.53252);
+  CHECK_MIXSCALAR_V5(Arr5, 1, 2., 3.12f, 4.121, 5.53252);
+}
+
+template<typename Scalar> void fixedsizeVariadicVectorConstruction()
+{
+  CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2<Matrix<Scalar,4,1>, Matrix<Scalar,5,1> >() ));
+  CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2<Matrix<Scalar,1,4>, Matrix<Scalar,1,5> >() ));
+  CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2<Array<Scalar,4,1>,  Array<Scalar,5,1>  >() ));
+  CALL_SUBTEST(( fixedsizeVariadicVectorConstruction2<Array<Scalar,1,4>,  Array<Scalar,1,5>  >() ));
+}
+
+
+template<typename Scalar> void initializerListVectorConstruction()
+{
+  Scalar raw[4];
+  for(int k = 0; k < 4; ++k) {
+    raw[k] = internal::random<Scalar>();
+  }
+  {
+    Matrix<Scalar, 4, 1> m { {raw[0]}, {raw[1]},{raw[2]},{raw[3]} };
+    Array<Scalar, 4, 1> a { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} };
+    for(int k = 0; k < 4; ++k) {
+      VERIFY(m(k) == raw[k]);
+    }
+    for(int k = 0; k < 4; ++k) {
+      VERIFY(a(k) == raw[k]);
+    }
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar,4,1>({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} })));
+    VERIFY((a == (Array<Scalar,4,1>({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }))).all());
+  }
+  {
+    Matrix<Scalar, 1, 4> m { {raw[0], raw[1], raw[2], raw[3]} };
+    Array<Scalar, 1, 4> a { {raw[0], raw[1], raw[2], raw[3]} };
+    for(int k = 0; k < 4; ++k) {
+      VERIFY(m(k) == raw[k]);
+    }
+    for(int k = 0; k < 4; ++k) {
+      VERIFY(a(k) == raw[k]);
+    }
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar, 1, 4>({{raw[0],raw[1],raw[2],raw[3]}})));
+    VERIFY((a == (Array<Scalar, 1, 4>({{raw[0],raw[1],raw[2],raw[3]}}))).all());
+  }
+  {
+    Matrix<Scalar, 4, Dynamic> m { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} };
+    Array<Scalar, 4, Dynamic> a { {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} };
+    for(int k=0; k < 4; ++k) {
+      VERIFY(m(k) == raw[k]);
+    }
+    for(int k=0; k < 4; ++k) {
+      VERIFY(a(k) == raw[k]);
+    }
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar, 4, Dynamic>({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} })));
+    VERIFY((a == (Array<Scalar, 4, Dynamic>({ {raw[0]}, {raw[1]}, {raw[2]}, {raw[3]} }))).all());
+  }
+  {
+    Matrix<Scalar, Dynamic, 4> m {{raw[0],raw[1],raw[2],raw[3]}};
+    Array<Scalar, Dynamic, 4> a {{raw[0],raw[1],raw[2],raw[3]}};
+    for(int k=0; k < 4; ++k) {
+      VERIFY(m(k) == raw[k]);
+    }
+    for(int k=0; k < 4; ++k) {
+      VERIFY(a(k) == raw[k]);
+    }
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar, Dynamic, 4>({{raw[0],raw[1],raw[2],raw[3]}})));
+    VERIFY((a == (Array<Scalar, Dynamic, 4>({{raw[0],raw[1],raw[2],raw[3]}}))).all());
+  }
+}
+
+template<typename Scalar> void initializerListMatrixConstruction()
+{
+  const Index RowsAtCompileTime = 5;
+  const Index ColsAtCompileTime = 4;
+  const Index SizeAtCompileTime = RowsAtCompileTime * ColsAtCompileTime;
+
+  Scalar raw[SizeAtCompileTime];
+  for (int i = 0; i < SizeAtCompileTime; ++i) {
+    raw[i] = internal::random<Scalar>();
+  }
+  {
+    Matrix<Scalar, Dynamic, Dynamic> m {};
+    VERIFY(m.cols() == 0);
+    VERIFY(m.rows() == 0);
+    VERIFY_IS_EQUAL(m, (Matrix<Scalar, Dynamic, Dynamic>()));
+  }
+  {
+    Matrix<Scalar, 5, 4> m {
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[4], raw[5], raw[6], raw[7]},
+      {raw[8], raw[9], raw[10], raw[11]},
+      {raw[12], raw[13], raw[14], raw[15]},
+      {raw[16], raw[17], raw[18], raw[19]}
+    };
+
+    Matrix<Scalar, 5, 4> m2;
+    m2 << raw[0], raw[1], raw[2], raw[3],
+          raw[4], raw[5], raw[6], raw[7],
+          raw[8], raw[9], raw[10], raw[11],
+          raw[12], raw[13], raw[14], raw[15],
+          raw[16], raw[17], raw[18], raw[19];
+
+    int k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        VERIFY(m(i, j) == raw[k]);
+        ++k;
+      }
+    }
+    VERIFY_IS_EQUAL(m, m2);
+  }
+  {
+    Matrix<Scalar, Dynamic, Dynamic> m{
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[4], raw[5], raw[6], raw[7]},
+      {raw[8], raw[9], raw[10], raw[11]},
+      {raw[12], raw[13], raw[14], raw[15]},
+      {raw[16], raw[17], raw[18], raw[19]}
+    };
+
+    VERIFY(m.cols() == 4);
+    VERIFY(m.rows() == 5);
+    int k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        VERIFY(m(i, j) == raw[k]);
+        ++k;
+      }
+    }
+
+    Matrix<Scalar, Dynamic, Dynamic> m2(RowsAtCompileTime, ColsAtCompileTime);
+    k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        m2(i, j) = raw[k];
+        ++k;
+      }
+    }
+    VERIFY_IS_EQUAL(m, m2);
+  }
+}
+
+template<typename Scalar> void initializerListArrayConstruction()
+{
+  const Index RowsAtCompileTime = 5;
+  const Index ColsAtCompileTime = 4;
+  const Index SizeAtCompileTime = RowsAtCompileTime * ColsAtCompileTime;
+
+  Scalar raw[SizeAtCompileTime];
+  for (int i = 0; i < SizeAtCompileTime; ++i) {
+    raw[i] = internal::random<Scalar>();
+  }
+  {
+    Array<Scalar, Dynamic, Dynamic> a {};
+    VERIFY(a.cols() == 0);
+    VERIFY(a.rows() == 0);
+  }
+  {
+    Array<Scalar, 5, 4> m {
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[4], raw[5], raw[6], raw[7]},
+      {raw[8], raw[9], raw[10], raw[11]},
+      {raw[12], raw[13], raw[14], raw[15]},
+      {raw[16], raw[17], raw[18], raw[19]}
+    };
+
+    Array<Scalar, 5, 4> m2;
+    m2 << raw[0], raw[1], raw[2], raw[3],
+          raw[4], raw[5], raw[6], raw[7],
+          raw[8], raw[9], raw[10], raw[11],
+          raw[12], raw[13], raw[14], raw[15],
+          raw[16], raw[17], raw[18], raw[19];
+
+    int k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        VERIFY(m(i, j) == raw[k]);
+        ++k;
+      }
+    }
+    VERIFY_IS_APPROX(m, m2);
+  }
+  {
+    Array<Scalar, Dynamic, Dynamic> m {
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[4], raw[5], raw[6], raw[7]},
+      {raw[8], raw[9], raw[10], raw[11]},
+      {raw[12], raw[13], raw[14], raw[15]},
+      {raw[16], raw[17], raw[18], raw[19]}
+    };
+
+    VERIFY(m.cols() == 4);
+    VERIFY(m.rows() == 5);
+    int k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        VERIFY(m(i, j) == raw[k]);
+        ++k;
+      }
+    }
+
+    Array<Scalar, Dynamic, Dynamic> m2(RowsAtCompileTime, ColsAtCompileTime);
+    k = 0;
+    for(int i = 0; i < RowsAtCompileTime; ++i) {
+      for (int j = 0; j < ColsAtCompileTime; ++j) {
+        m2(i, j) = raw[k];
+        ++k;
+      }
+    }
+    VERIFY_IS_APPROX(m, m2);
+  }
+}
+
+template<typename Scalar> void dynamicVectorConstruction()
+{
+  const Index size = 4;
+  Scalar raw[size];
+  for (int i = 0; i < size; ++i) {
+    raw[i] = internal::random<Scalar>();
+  }
+
+  typedef Matrix<Scalar, Dynamic, 1>  VectorX;
+
+  {
+    VectorX v {{raw[0], raw[1], raw[2], raw[3]}};
+    for (int i = 0; i < size; ++i) {
+      VERIFY(v(i) == raw[i]);
+    }
+    VERIFY(v.rows() == size);
+    VERIFY(v.cols() == 1);
+    VERIFY_IS_EQUAL(v, (VectorX {{raw[0], raw[1], raw[2], raw[3]}}));
+  }
+
+  {
+    VERIFY_RAISES_ASSERT((VectorX {raw[0], raw[1], raw[2], raw[3]}));
+  }
+  {
+    VERIFY_RAISES_ASSERT((VectorX  {
+      {raw[0], raw[1], raw[2], raw[3]},
+      {raw[0], raw[1], raw[2], raw[3]},
+    }));
+  }
+}
+
+EIGEN_DECLARE_TEST(initializer_list_construction)
+{
+  CALL_SUBTEST_1(initializerListVectorConstruction<unsigned char>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<float>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<double>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<int>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<long int>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<std::complex<double>>());
+  CALL_SUBTEST_1(initializerListVectorConstruction<std::complex<float>>());
+
+  CALL_SUBTEST_2(initializerListMatrixConstruction<unsigned char>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<float>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<double>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<int>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<long int>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<std::complex<double>>());
+  CALL_SUBTEST_2(initializerListMatrixConstruction<std::complex<float>>());
+
+  CALL_SUBTEST_3(initializerListArrayConstruction<unsigned char>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<float>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<double>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<int>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<long int>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<std::complex<double>>());
+  CALL_SUBTEST_3(initializerListArrayConstruction<std::complex<float>>());
+
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<unsigned char>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<float>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<double>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<int>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<long int>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<std::complex<double>>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction<std::complex<float>>());
+  CALL_SUBTEST_4(fixedsizeVariadicVectorConstruction3<0>());
+
+  CALL_SUBTEST_5(TestMethodDispatching<int>::run());
+  CALL_SUBTEST_5(TestMethodDispatching<long int>::run());
+
+  CALL_SUBTEST_6(dynamicVectorConstruction<unsigned char>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<float>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<double>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<int>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<long int>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<std::ptrdiff_t>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<std::complex<double>>());
+  CALL_SUBTEST_6(dynamicVectorConstruction<std::complex<float>>());
+}
diff --git a/contrib/eigen/test/inplace_decomposition.cpp b/contrib/eigen/test/inplace_decomposition.cpp
index 92d0d91b6b42e0d27031f968af38e29233df8e23..e3aa9957d0338463b1b8c5a602965d362aabf1c1 100644
--- a/contrib/eigen/test/inplace_decomposition.cpp
+++ b/contrib/eigen/test/inplace_decomposition.cpp
@@ -79,7 +79,7 @@ template<typename DecType,typename MatrixType> void inplace(bool square = false,
 }
 
 
-void test_inplace_decomposition()
+EIGEN_DECLARE_TEST(inplace_decomposition)
 {
   EIGEN_UNUSED typedef Matrix<double,4,3> Matrix43d;
   for(int i = 0; i < g_repeat; i++) {
diff --git a/contrib/eigen/test/integer_types.cpp b/contrib/eigen/test/integer_types.cpp
index 36295598f42b647be305868642114fd4b76ee3c8..31f4100c5ad05bf213c1844eb263e45dcc7e633a 100644
--- a/contrib/eigen/test/integer_types.cpp
+++ b/contrib/eigen/test/integer_types.cpp
@@ -131,7 +131,18 @@ template<typename MatrixType> void integer_type_tests(const MatrixType& m)
   VERIFY_IS_APPROX((m1 * m2.transpose()) * m1, m1 * (m2.transpose() * m1));
 }
 
-void test_integer_types()
+template<int>
+void integer_types_extra()
+{
+  VERIFY_IS_EQUAL(int(internal::scalar_div_cost<int>::value), 8);
+  VERIFY_IS_EQUAL(int(internal::scalar_div_cost<unsigned int>::value), 8);
+  if(sizeof(long)>sizeof(int)) {
+    VERIFY(int(internal::scalar_div_cost<long>::value) > int(internal::scalar_div_cost<int>::value));
+    VERIFY(int(internal::scalar_div_cost<unsigned long>::value) > int(internal::scalar_div_cost<int>::value));
+  }
+}
+
+EIGEN_DECLARE_TEST(integer_types)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( integer_type_tests(Matrix<unsigned int, 1, 1>()) );
@@ -151,17 +162,12 @@ void test_integer_types()
 
     CALL_SUBTEST_6( integer_type_tests(Matrix<unsigned short, 4, 4>()) );
 
+#if EIGEN_HAS_CXX11
     CALL_SUBTEST_7( integer_type_tests(Matrix<long long, 11, 13>()) );
     CALL_SUBTEST_7( signed_integer_type_tests(Matrix<long long, 11, 13>()) );
 
     CALL_SUBTEST_8( integer_type_tests(Matrix<unsigned long long, Dynamic, 5>(1, 5)) );
-  }
-#ifdef EIGEN_TEST_PART_9
-  VERIFY_IS_EQUAL(internal::scalar_div_cost<int>::value, 8);
-  VERIFY_IS_EQUAL(internal::scalar_div_cost<unsigned int>::value, 8);
-  if(sizeof(long)>sizeof(int)) {
-    VERIFY(int(internal::scalar_div_cost<long>::value) > int(internal::scalar_div_cost<int>::value));
-    VERIFY(int(internal::scalar_div_cost<unsigned long>::value) > int(internal::scalar_div_cost<int>::value));
-  }
 #endif
+  }
+  CALL_SUBTEST_9( integer_types_extra<0>() );
 }
diff --git a/contrib/eigen/test/inverse.cpp b/contrib/eigen/test/inverse.cpp
index d81af26c1e60ef102814705ad12b5ad28736ed3f..9cedfa1e1069ce7a15e4056a6ab95fe4cccc7405 100644
--- a/contrib/eigen/test/inverse.cpp
+++ b/contrib/eigen/test/inverse.cpp
@@ -11,35 +11,19 @@
 #include "main.h"
 #include <Eigen/LU>
 
-template<typename MatrixType> void inverse(const MatrixType& m)
+template<typename MatrixType>
+void inverse_for_fixed_size(const MatrixType&, typename internal::enable_if<MatrixType::SizeAtCompileTime==Dynamic>::type* = 0)
 {
-  using std::abs;
-  /* this test covers the following files:
-     Inverse.h
-  */
-  Index rows = m.rows();
-  Index cols = m.cols();
-
-  typedef typename MatrixType::Scalar Scalar;
-
-  MatrixType m1(rows, cols),
-             m2(rows, cols),
-             identity = MatrixType::Identity(rows, rows);
-  createRandomPIMatrixOfRank(rows,rows,rows,m1);
-  m2 = m1.inverse();
-  VERIFY_IS_APPROX(m1, m2.inverse() );
-
-  VERIFY_IS_APPROX((Scalar(2)*m2).inverse(), m2.inverse()*Scalar(0.5));
-
-  VERIFY_IS_APPROX(identity, m1.inverse() * m1 );
-  VERIFY_IS_APPROX(identity, m1 * m1.inverse() );
+}
 
-  VERIFY_IS_APPROX(m1, m1.inverse().inverse() );
+template<typename MatrixType>
+void inverse_for_fixed_size(const MatrixType& m1, typename internal::enable_if<MatrixType::SizeAtCompileTime!=Dynamic>::type* = 0)
+{
+  using std::abs;
 
-  // since for the general case we implement separately row-major and col-major, test that
-  VERIFY_IS_APPROX(MatrixType(m1.transpose().inverse()), MatrixType(m1.inverse().transpose()));
+  MatrixType m2, identity = MatrixType::Identity();
 
-#if !defined(EIGEN_TEST_PART_5) && !defined(EIGEN_TEST_PART_6)
+  typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
   
@@ -60,23 +44,52 @@ template<typename MatrixType> void inverse(const MatrixType& m)
   VERIFY_IS_APPROX(identity, m1*m2);
 
   //Second: a rank one matrix (not invertible, except for 1x1 matrices)
-  VectorType v3 = VectorType::Random(rows);
-  MatrixType m3 = v3*v3.transpose(), m4(rows,cols);
+  VectorType v3 = VectorType::Random();
+  MatrixType m3 = v3*v3.transpose(), m4;
   m3.computeInverseAndDetWithCheck(m4, det, invertible);
-  VERIFY( rows==1 ? invertible : !invertible );
+  VERIFY( m1.rows()==1 ? invertible : !invertible );
   VERIFY_IS_MUCH_SMALLER_THAN(abs(det-m3.determinant()), RealScalar(1));
   m3.computeInverseWithCheck(m4, invertible);
-  VERIFY( rows==1 ? invertible : !invertible );
+  VERIFY( m1.rows()==1 ? invertible : !invertible );
   
   // check with submatrices
   {
     Matrix<Scalar, MatrixType::RowsAtCompileTime+1, MatrixType::RowsAtCompileTime+1, MatrixType::Options> m5;
     m5.setRandom();
-    m5.topLeftCorner(rows,rows) = m1;
+    m5.topLeftCorner(m1.rows(),m1.rows()) = m1;
     m2 = m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>().inverse();
     VERIFY_IS_APPROX( (m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>()), m2.inverse() );
   }
-#endif
+}
+
+template<typename MatrixType> void inverse(const MatrixType& m)
+{
+  /* this test covers the following files:
+     Inverse.h
+  */
+  Index rows = m.rows();
+  Index cols = m.cols();
+
+  typedef typename MatrixType::Scalar Scalar;
+
+  MatrixType m1(rows, cols),
+             m2(rows, cols),
+             identity = MatrixType::Identity(rows, rows);
+  createRandomPIMatrixOfRank(rows,rows,rows,m1);
+  m2 = m1.inverse();
+  VERIFY_IS_APPROX(m1, m2.inverse() );
+
+  VERIFY_IS_APPROX((Scalar(2)*m2).inverse(), m2.inverse()*Scalar(0.5));
+
+  VERIFY_IS_APPROX(identity, m1.inverse() * m1 );
+  VERIFY_IS_APPROX(identity, m1 * m1.inverse() );
+
+  VERIFY_IS_APPROX(m1, m1.inverse().inverse() );
+
+  // since for the general case we implement separately row-major and col-major, test that
+  VERIFY_IS_APPROX(MatrixType(m1.transpose().inverse()), MatrixType(m1.inverse().transpose()));
+
+  inverse_for_fixed_size(m1);
 
   // check in-place inversion
   if(MatrixType::RowsAtCompileTime>=2 && MatrixType::RowsAtCompileTime<=4)
@@ -108,7 +121,7 @@ void inverse_zerosized()
   }
 }
 
-void test_inverse()
+EIGEN_DECLARE_TEST(inverse)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
@@ -122,6 +135,8 @@ void test_inverse()
     CALL_SUBTEST_5( inverse(MatrixXf(s,s)) );
     TEST_SET_BUT_UNUSED_VARIABLE(s)
     CALL_SUBTEST_5( inverse_zerosized<float>() );
+    CALL_SUBTEST_5( inverse(MatrixXf(0, 0)) );
+    CALL_SUBTEST_5( inverse(MatrixXf(1, 1)) );
     
     s = internal::random<int>(25,100);
     CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) );
diff --git a/contrib/eigen/test/io.cpp b/contrib/eigen/test/io.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa14e76e9b984d922e09fb743d78648e92de6bdb
--- /dev/null
+++ b/contrib/eigen/test/io.cpp
@@ -0,0 +1,71 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Joel Holdsworth <joel.holdsworth@vcatechnology.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <sstream>
+
+#include "main.h"
+
+template<typename Scalar>
+struct check_ostream_impl
+{
+  static void run()
+  {
+    const Array<Scalar,1,1> array(123);
+    std::ostringstream ss;
+    ss << array;
+    VERIFY(ss.str() == "123");
+
+    check_ostream_impl< std::complex<Scalar> >::run();
+  };
+};
+
+template<>
+struct check_ostream_impl<bool>
+{
+  static void run()
+  {
+    const Array<bool,1,2> array(1, 0);
+    std::ostringstream ss;
+    ss << array;
+    VERIFY(ss.str() == "1  0");
+  };
+};
+
+template<typename Scalar>
+struct check_ostream_impl< std::complex<Scalar> >
+{
+  static void run()
+  {
+    const Array<std::complex<Scalar>,1,1> array(std::complex<Scalar>(12, 34));
+    std::ostringstream ss;
+    ss << array;
+    VERIFY(ss.str() == "(12,34)");
+  };
+};
+
+template<typename Scalar>
+static void check_ostream()
+{
+  check_ostream_impl<Scalar>::run();
+}
+
+EIGEN_DECLARE_TEST(rand)
+{
+  CALL_SUBTEST(check_ostream<bool>());
+  CALL_SUBTEST(check_ostream<float>());
+  CALL_SUBTEST(check_ostream<double>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::int8_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::uint8_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::int16_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::uint16_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::int32_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::uint32_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::int64_t>());
+  CALL_SUBTEST(check_ostream<Eigen::numext::uint64_t>());
+}
diff --git a/contrib/eigen/test/is_same_dense.cpp b/contrib/eigen/test/is_same_dense.cpp
index 2c7838ce9696b61ead380a5f4e8322bfc2bd45e4..23dd806ebd9941b6c5477cbed2eb37749e4d7156 100644
--- a/contrib/eigen/test/is_same_dense.cpp
+++ b/contrib/eigen/test/is_same_dense.cpp
@@ -11,12 +11,16 @@
 
 using internal::is_same_dense;
 
-void test_is_same_dense()
+EIGEN_DECLARE_TEST(is_same_dense)
 {
   typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd;
+  typedef Matrix<std::complex<double>,Dynamic,Dynamic,ColMajor> ColMatrixXcd;
   ColMatrixXd m1(10,10);
+  ColMatrixXcd m2(10,10);
   Ref<ColMatrixXd> ref_m1(m1);
+  Ref<ColMatrixXd,0, Stride<Dynamic,Dynamic> >  ref_m2_real(m2.real());
   Ref<const ColMatrixXd> const_ref_m1(m1);
+
   VERIFY(is_same_dense(m1,m1));
   VERIFY(is_same_dense(m1,ref_m1));
   VERIFY(is_same_dense(const_ref_m1,m1));
@@ -30,4 +34,8 @@ void test_is_same_dense()
   
   Ref<const ColMatrixXd> const_ref_m1_col(m1.col(1));
   VERIFY(is_same_dense(m1.col(1),const_ref_m1_col));
+
+
+  VERIFY(!is_same_dense(m1, ref_m2_real));
+  VERIFY(!is_same_dense(m2, ref_m2_real));
 }
diff --git a/contrib/eigen/test/jacobi.cpp b/contrib/eigen/test/jacobi.cpp
index 319e4767abe41719a6a24fd7b97b00d0c9633653..5604797f526089176bb061f86ab1503b162985c8 100644
--- a/contrib/eigen/test/jacobi.cpp
+++ b/contrib/eigen/test/jacobi.cpp
@@ -57,7 +57,7 @@ void jacobi(const MatrixType& m = MatrixType())
   }
 }
 
-void test_jacobi()
+EIGEN_DECLARE_TEST(jacobi)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(( jacobi<Matrix3f, float>() ));
diff --git a/contrib/eigen/test/jacobisvd.cpp b/contrib/eigen/test/jacobisvd.cpp
index 64b8663580ee3e6f96050f076118c6dbcdd2c256..5b15c5a27bff8996be7ab66dc49581fcfd226840 100644
--- a/contrib/eigen/test/jacobisvd.cpp
+++ b/contrib/eigen/test/jacobisvd.cpp
@@ -36,6 +36,9 @@ void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
 template<typename MatrixType> void jacobisvd_verify_assert(const MatrixType& m)
 {
   svd_verify_assert<JacobiSVD<MatrixType> >(m);
+  svd_verify_assert<JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> >(m, true);
+  svd_verify_assert<JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner> >(m);
+  svd_verify_assert<JacobiSVD<MatrixType, HouseholderQRPreconditioner> >(m);
   Index rows = m.rows();
   Index cols = m.cols();
 
@@ -67,6 +70,8 @@ void jacobisvd_method()
   VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixU());
   VERIFY_RAISES_ASSERT(m.jacobiSvd().matrixV());
   VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m);
+  VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).transpose().solve(m), m);
+  VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m);
 }
 
 namespace Foo {
@@ -84,7 +89,7 @@ void msvc_workaround()
   std::max EIGEN_NOT_A_MACRO (a,b);
 }
 
-void test_jacobisvd()
+EIGEN_DECLARE_TEST(jacobisvd)
 {
   CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) ));
   CALL_SUBTEST_4(( jacobisvd_verify_assert(Matrix4d()) ));
diff --git a/contrib/eigen/test/klu_support.cpp b/contrib/eigen/test/klu_support.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f806ad50ef566bdb569fae6776a90323ee2076ca
--- /dev/null
+++ b/contrib/eigen/test/klu_support.cpp
@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse_solver.h"
+
+#include <Eigen/KLUSupport>
+
+template<typename T> void test_klu_support_T()
+{
+  KLU<SparseMatrix<T, ColMajor> > klu_colmajor;
+  KLU<SparseMatrix<T, RowMajor> > klu_rowmajor;
+  
+  check_sparse_square_solving(klu_colmajor);
+  check_sparse_square_solving(klu_rowmajor);
+  
+  //check_sparse_square_determinant(umfpack_colmajor);
+  //check_sparse_square_determinant(umfpack_rowmajor);
+}
+
+EIGEN_DECLARE_TEST(klu_support)
+{
+  CALL_SUBTEST_1(test_klu_support_T<double>());
+  CALL_SUBTEST_2(test_klu_support_T<std::complex<double> >());
+}
+
diff --git a/contrib/eigen/test/linearstructure.cpp b/contrib/eigen/test/linearstructure.cpp
index b6559b2a0ebb75110acad76379306a2ec5e7d72c..46ee5162b84049103500e2997814923f5bfa0243 100644
--- a/contrib/eigen/test/linearstructure.cpp
+++ b/contrib/eigen/test/linearstructure.cpp
@@ -110,7 +110,20 @@ template<typename MatrixType> void real_complex(DenseIndex rows = MatrixType::Ro
   VERIFY(g_called && "matrix<complex> - real not properly optimized");
 }
 
-void test_linearstructure()
+template<int>
+void linearstructure_overflow()
+{
+  // make sure that /=scalar and /scalar do not overflow
+  // rational: 1.0/4.94e-320 overflow, but m/4.94e-320 should not
+  Matrix4d m2, m3;
+  m3 = m2 =  Matrix4d::Random()*1e-20;
+  m2 = m2 / 4.9e-320;
+  VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones());
+  m3 /= 4.9e-320;
+  VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones());
+}
+
+EIGEN_DECLARE_TEST(linearstructure)
 {
   g_called = true;
   VERIFY(g_called); // avoid `unneeded-internal-declaration` warning.
@@ -130,19 +143,5 @@ void test_linearstructure()
     CALL_SUBTEST_11( real_complex<MatrixXcf>(10,10) );
     CALL_SUBTEST_11( real_complex<ArrayXXcf>(10,10) );
   }
-  
-#ifdef EIGEN_TEST_PART_4
-  {
-    // make sure that /=scalar and /scalar do not overflow
-    // rational: 1.0/4.94e-320 overflow, but m/4.94e-320 should not
-    Matrix4d m2, m3;
-    m3 = m2 =  Matrix4d::Random()*1e-20;
-    m2 = m2 / 4.9e-320;
-    VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones());
-    m3 /= 4.9e-320;
-    VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones());
-    
-    
-  }
-#endif
+  CALL_SUBTEST_4( linearstructure_overflow<0>() );
 }
diff --git a/contrib/eigen/test/lscg.cpp b/contrib/eigen/test/lscg.cpp
index d49ee00c31997eb88b6c19e04b475da3fefc3922..feb2347a8fc7c20400bbbc9c170cf54690279788 100644
--- a/contrib/eigen/test/lscg.cpp
+++ b/contrib/eigen/test/lscg.cpp
@@ -30,7 +30,7 @@ template<typename T> void test_lscg_T()
   CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_I)     );
 }
 
-void test_lscg()
+EIGEN_DECLARE_TEST(lscg)
 {
   CALL_SUBTEST_1(test_lscg_T<double>());
   CALL_SUBTEST_2(test_lscg_T<std::complex<double> >());
diff --git a/contrib/eigen/test/lu.cpp b/contrib/eigen/test/lu.cpp
index 176a2f09081ad08bd34647e4d9046b315da644e0..1bbadcbf0ff6b65479cba01849503f7c7dc33117 100644
--- a/contrib/eigen/test/lu.cpp
+++ b/contrib/eigen/test/lu.cpp
@@ -9,6 +9,7 @@
 
 #include "main.h"
 #include <Eigen/LU>
+#include "solverbase.h"
 using namespace std;
 
 template<typename MatrixType>
@@ -18,6 +19,8 @@ typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
 
 template<typename MatrixType> void lu_non_invertible()
 {
+  STATIC_CHECK(( internal::is_same<typename FullPivLU<MatrixType>::StorageIndex,int>::value ));
+
   typedef typename MatrixType::RealScalar RealScalar;
   /* this test covers the following files:
      LU.h
@@ -94,38 +97,20 @@ template<typename MatrixType> void lu_non_invertible()
   VERIFY(m1image.fullPivLu().rank() == rank);
   VERIFY_IS_APPROX(m1 * m1.adjoint() * m1image, m1image);
 
+  check_solverbase<CMatrixType, MatrixType>(m1, lu, rows, cols, cols2);
+
   m2 = CMatrixType::Random(cols,cols2);
   m3 = m1*m2;
   m2 = CMatrixType::Random(cols,cols2);
   // test that the code, which does resize(), may be applied to an xpr
   m2.block(0,0,m2.rows(),m2.cols()) = lu.solve(m3);
   VERIFY_IS_APPROX(m3, m1*m2);
-
-  // test solve with transposed
-  m3 = MatrixType::Random(rows,cols2);
-  m2 = m1.transpose()*m3;
-  m3 = MatrixType::Random(rows,cols2);
-  lu.template _solve_impl_transposed<false>(m2, m3);
-  VERIFY_IS_APPROX(m2, m1.transpose()*m3);
-  m3 = MatrixType::Random(rows,cols2);
-  m3 = lu.transpose().solve(m2);
-  VERIFY_IS_APPROX(m2, m1.transpose()*m3);
-
-  // test solve with conjugate transposed
-  m3 = MatrixType::Random(rows,cols2);
-  m2 = m1.adjoint()*m3;
-  m3 = MatrixType::Random(rows,cols2);
-  lu.template _solve_impl_transposed<true>(m2, m3);
-  VERIFY_IS_APPROX(m2, m1.adjoint()*m3);
-  m3 = MatrixType::Random(rows,cols2);
-  m3 = lu.adjoint().solve(m2);
-  VERIFY_IS_APPROX(m2, m1.adjoint()*m3);
 }
 
 template<typename MatrixType> void lu_invertible()
 {
   /* this test covers the following files:
-     LU.h
+     FullPivLU.h
   */
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
   Index size = MatrixType::RowsAtCompileTime;
@@ -148,10 +133,12 @@ template<typename MatrixType> void lu_invertible()
   VERIFY(lu.isSurjective());
   VERIFY(lu.isInvertible());
   VERIFY(lu.image(m1).fullPivLu().isInvertible());
+
+  check_solverbase<MatrixType, MatrixType>(m1, lu, size, size, size);
+
+  MatrixType m1_inverse = lu.inverse();
   m3 = MatrixType::Random(size,size);
   m2 = lu.solve(m3);
-  VERIFY_IS_APPROX(m3, m1*m2);
-  MatrixType m1_inverse = lu.inverse();
   VERIFY_IS_APPROX(m2, m1_inverse*m3);
 
   RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
@@ -160,63 +147,37 @@ template<typename MatrixType> void lu_invertible()
   // truth.
   VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
 
-  // test solve with transposed
-  lu.template _solve_impl_transposed<false>(m3, m2);
-  VERIFY_IS_APPROX(m3, m1.transpose()*m2);
-  m3 = MatrixType::Random(size,size);
-  m3 = lu.transpose().solve(m2);
-  VERIFY_IS_APPROX(m2, m1.transpose()*m3);
-
-  // test solve with conjugate transposed
-  lu.template _solve_impl_transposed<true>(m3, m2);
-  VERIFY_IS_APPROX(m3, m1.adjoint()*m2);
-  m3 = MatrixType::Random(size,size);
-  m3 = lu.adjoint().solve(m2);
-  VERIFY_IS_APPROX(m2, m1.adjoint()*m3);
-
   // Regression test for Bug 302
   MatrixType m4 = MatrixType::Random(size,size);
   VERIFY_IS_APPROX(lu.solve(m3*m4), lu.solve(m3)*m4);
 }
 
-template<typename MatrixType> void lu_partial_piv()
+template<typename MatrixType> void lu_partial_piv(Index size = MatrixType::ColsAtCompileTime)
 {
   /* this test covers the following files:
      PartialPivLU.h
   */
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  Index size = internal::random<Index>(1,4);
 
   MatrixType m1(size, size), m2(size, size), m3(size, size);
   m1.setRandom();
   PartialPivLU<MatrixType> plu(m1);
 
+  STATIC_CHECK(( internal::is_same<typename PartialPivLU<MatrixType>::StorageIndex,int>::value ));
+
   VERIFY_IS_APPROX(m1, plu.reconstructedMatrix());
 
+  check_solverbase<MatrixType, MatrixType>(m1, plu, size, size, size);
+
+  MatrixType m1_inverse = plu.inverse();
   m3 = MatrixType::Random(size,size);
   m2 = plu.solve(m3);
-  VERIFY_IS_APPROX(m3, m1*m2);
-  MatrixType m1_inverse = plu.inverse();
   VERIFY_IS_APPROX(m2, m1_inverse*m3);
 
   RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
   const RealScalar rcond_est = plu.rcond();
   // Verify that the estimate is within a factor of 10 of the truth.
   VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
-
-  // test solve with transposed
-  plu.template _solve_impl_transposed<false>(m3, m2);
-  VERIFY_IS_APPROX(m3, m1.transpose()*m2);
-  m3 = MatrixType::Random(size,size);
-  m3 = plu.transpose().solve(m2);
-  VERIFY_IS_APPROX(m2, m1.transpose()*m3);
-
-  // test solve with conjugate transposed
-  plu.template _solve_impl_transposed<true>(m3, m2);
-  VERIFY_IS_APPROX(m3, m1.adjoint()*m2);
-  m3 = MatrixType::Random(size,size);
-  m3 = plu.adjoint().solve(m2);
-  VERIFY_IS_APPROX(m2, m1.adjoint()*m3);
 }
 
 template<typename MatrixType> void lu_verify_assert()
@@ -230,6 +191,8 @@ template<typename MatrixType> void lu_verify_assert()
   VERIFY_RAISES_ASSERT(lu.kernel())
   VERIFY_RAISES_ASSERT(lu.image(tmp))
   VERIFY_RAISES_ASSERT(lu.solve(tmp))
+  VERIFY_RAISES_ASSERT(lu.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(lu.adjoint().solve(tmp))
   VERIFY_RAISES_ASSERT(lu.determinant())
   VERIFY_RAISES_ASSERT(lu.rank())
   VERIFY_RAISES_ASSERT(lu.dimensionOfKernel())
@@ -242,19 +205,25 @@ template<typename MatrixType> void lu_verify_assert()
   VERIFY_RAISES_ASSERT(plu.matrixLU())
   VERIFY_RAISES_ASSERT(plu.permutationP())
   VERIFY_RAISES_ASSERT(plu.solve(tmp))
+  VERIFY_RAISES_ASSERT(plu.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(plu.adjoint().solve(tmp))
   VERIFY_RAISES_ASSERT(plu.determinant())
   VERIFY_RAISES_ASSERT(plu.inverse())
 }
 
-void test_lu()
+EIGEN_DECLARE_TEST(lu)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( lu_non_invertible<Matrix3f>() );
     CALL_SUBTEST_1( lu_invertible<Matrix3f>() );
     CALL_SUBTEST_1( lu_verify_assert<Matrix3f>() );
+    CALL_SUBTEST_1( lu_partial_piv<Matrix3f>() );
 
     CALL_SUBTEST_2( (lu_non_invertible<Matrix<double, 4, 6> >()) );
     CALL_SUBTEST_2( (lu_verify_assert<Matrix<double, 4, 6> >()) );
+    CALL_SUBTEST_2( lu_partial_piv<Matrix2d>() );
+    CALL_SUBTEST_2( lu_partial_piv<Matrix4d>() );
+    CALL_SUBTEST_2( (lu_partial_piv<Matrix<double,6,6> >()) );
 
     CALL_SUBTEST_3( lu_non_invertible<MatrixXf>() );
     CALL_SUBTEST_3( lu_invertible<MatrixXf>() );
@@ -262,7 +231,7 @@ void test_lu()
 
     CALL_SUBTEST_4( lu_non_invertible<MatrixXd>() );
     CALL_SUBTEST_4( lu_invertible<MatrixXd>() );
-    CALL_SUBTEST_4( lu_partial_piv<MatrixXd>() );
+    CALL_SUBTEST_4( lu_partial_piv<MatrixXd>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) );
     CALL_SUBTEST_4( lu_verify_assert<MatrixXd>() );
 
     CALL_SUBTEST_5( lu_non_invertible<MatrixXcf>() );
@@ -271,7 +240,7 @@ void test_lu()
 
     CALL_SUBTEST_6( lu_non_invertible<MatrixXcd>() );
     CALL_SUBTEST_6( lu_invertible<MatrixXcd>() );
-    CALL_SUBTEST_6( lu_partial_piv<MatrixXcd>() );
+    CALL_SUBTEST_6( lu_partial_piv<MatrixXcd>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)) );
     CALL_SUBTEST_6( lu_verify_assert<MatrixXcd>() );
 
     CALL_SUBTEST_7(( lu_non_invertible<Matrix<float,Dynamic,16> >() ));
diff --git a/contrib/eigen/test/main.h b/contrib/eigen/test/main.h
index 18bb5c825501c97c0e7484ac7cfffd964be4e384..07f3794ac8be407cc859bc44ae2afa14902d8d21 100644
--- a/contrib/eigen/test/main.h
+++ b/contrib/eigen/test/main.h
@@ -1,3 +1,4 @@
+
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
@@ -17,6 +18,7 @@
 #include <sstream>
 #include <vector>
 #include <typeinfo>
+#include <functional>
 
 // The following includes of STL headers have to be done _before_ the
 // definition of macros min() and max().  The reason is that many STL
@@ -38,28 +40,33 @@
 // definitions.
 #include <limits>
 #include <algorithm>
+// Disable ICC's std::complex operator specializations so we can use our own.
+#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
 #include <complex>
 #include <deque>
 #include <queue>
 #include <cassert>
 #include <list>
-#if __cplusplus >= 201103L
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
 #include <random>
+#include <chrono>
 #ifdef EIGEN_USE_THREADS
 #include <future>
 #endif
 #endif
 
 // Same for cuda_fp16.h
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
-#define EIGEN_TEST_CUDACC_VER  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  // Means the compiler is either nvcc or clang with CUDA enabled
+  #define EIGEN_CUDACC __CUDACC__
+#endif
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
 #else
-#define EIGEN_TEST_CUDACC_VER 0
+  #define EIGEN_CUDA_SDK_VER 0
 #endif
-
-#if EIGEN_TEST_CUDACC_VER >= 70500
+#if EIGEN_CUDA_SDK_VER >= 70500
 #include <cuda_fp16.h>
 #endif
 
@@ -67,11 +74,27 @@
 // protected by parenthesis against macro expansion, the min()/max() macros
 // are defined here and any not-parenthesized min/max call will cause a
 // compiler error.
-#define min(A,B) please_protect_your_min_with_parentheses
-#define max(A,B) please_protect_your_max_with_parentheses
-#define isnan(X) please_protect_your_isnan_with_parentheses
-#define isinf(X) please_protect_your_isinf_with_parentheses
-#define isfinite(X) please_protect_your_isfinite_with_parentheses
+#if !defined(__HIPCC__) && !defined(EIGEN_USE_SYCL)
+  //
+  // HIP header files include the following files
+  //  <thread>
+  //  <regex>
+  //  <unordered_map>
+  // which seem to contain not-parenthesized calls to "max"/"min", triggering the following check and causing the compile to fail
+  //
+  // Including those header files before the following macro definition for "min" / "max", only partially resolves the issue
+  // This is because other HIP header files also define "isnan" / "isinf" / "isfinite" functions, which are needed in other
+  // headers.
+  //
+  // So instead choosing to simply disable this check for HIP
+  //
+  #define min(A,B) please_protect_your_min_with_parentheses
+  #define max(A,B) please_protect_your_max_with_parentheses
+  #define isnan(X) please_protect_your_isnan_with_parentheses
+  #define isinf(X) please_protect_your_isinf_with_parentheses
+  #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#endif
+
 
 // test possible conflicts
 struct real {};
@@ -85,6 +108,8 @@ struct imag {};
 #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
 // B0 is defined in POSIX header termios.h
 #define B0 FORBIDDEN_IDENTIFIER
+// `I` may be defined by complex.h:
+#define I  FORBIDDEN_IDENTIFIER
 
 // Unit tests calling Eigen's blas library must preserve the default blocking size
 // to avoid troubles.
@@ -111,13 +136,12 @@ inline void on_temporary_creation(long int size) {
 #define VERIFY_EVALUATION_COUNT(XPR,N) {\
     nb_temporaries = 0; \
     XPR; \
-    if(nb_temporaries!=N) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
-    VERIFY( (#XPR) && nb_temporaries==N ); \
+    if(nb_temporaries!=(N)) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
+    VERIFY( (#XPR) && nb_temporaries==(N) ); \
   }
-  
+
 #endif
 
-// the following file is automatically generated by cmake
 #include "split_test_helper.h"
 
 #ifdef NDEBUG
@@ -134,10 +158,6 @@ inline void on_temporary_creation(long int size) {
 #define EIGEN_MAKING_DOCS
 #endif
 
-#ifndef EIGEN_TEST_FUNC
-#error EIGEN_TEST_FUNC must be defined
-#endif
-
 #define DEFAULT_REPEAT 10
 
 namespace Eigen
@@ -146,20 +166,50 @@ namespace Eigen
   // level == 0 <=> abort if test fail
   // level >= 1 <=> warning message to std::cerr if test fail
   static int g_test_level = 0;
-  static int g_repeat;
-  static unsigned int g_seed;
-  static bool g_has_set_repeat, g_has_set_seed;
+  static int g_repeat = 1;
+  static unsigned int g_seed = 0;
+  static bool g_has_set_repeat = false, g_has_set_seed = false;
+
+  class EigenTest
+  {
+  public:
+    EigenTest() : m_func(0) {}
+    EigenTest(const char* a_name, void (*func)(void))
+      : m_name(a_name), m_func(func)
+    {
+      get_registered_tests().push_back(this);
+    }
+    const std::string& name() const { return m_name; }
+    void operator()() const { m_func(); }
+
+    static const std::vector<EigenTest*>& all() { return get_registered_tests(); }
+  protected:
+    static std::vector<EigenTest*>& get_registered_tests()
+    {
+      static std::vector<EigenTest*>* ms_registered_tests = new std::vector<EigenTest*>();
+      return *ms_registered_tests;
+    }
+    std::string m_name;
+    void (*m_func)(void);
+  };
+
+  // Declare and register a test, e.g.:
+  //    EIGEN_DECLARE_TEST(mytest) { ... }
+  // will create a function:
+  //    void test_mytest() { ... }
+  // that will be automatically called.
+  #define EIGEN_DECLARE_TEST(X) \
+    void EIGEN_CAT(test_,X) (); \
+    static EigenTest EIGEN_CAT(test_handler_,X) (EIGEN_MAKESTRING(X), & EIGEN_CAT(test_,X)); \
+    void EIGEN_CAT(test_,X) ()
 }
 
 #define TRACK std::cerr << __FILE__ << " " << __LINE__ << std::endl
 // #define TRACK while()
 
-#define EI_PP_MAKE_STRING2(S) #S
-#define EI_PP_MAKE_STRING(S) EI_PP_MAKE_STRING2(S)
-
 #define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, "  ", "\n", "", "", "", "")
 
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
   #define EIGEN_EXCEPTIONS
 #endif
 
@@ -188,7 +238,7 @@ namespace Eigen
     };
   }
   // If EIGEN_DEBUG_ASSERTS is defined and if no assertion is triggered while
-  // one should have been, then the list of excecuted assertions is printed out.
+  // one should have been, then the list of executed assertions is printed out.
   //
   // EIGEN_DEBUG_ASSERTS is not enabled by default as it
   // significantly increases the compilation time
@@ -214,7 +264,7 @@ namespace Eigen
       }                                     \
       else if (Eigen::internal::push_assert)       \
       {                                     \
-        eigen_assert_list.push_back(std::string(EI_PP_MAKE_STRING(__FILE__) " (" EI_PP_MAKE_STRING(__LINE__) ") : " #a) ); \
+        eigen_assert_list.push_back(std::string(EIGEN_MAKESTRING(__FILE__) " (" EIGEN_MAKESTRING(__LINE__) ") : " #a) ); \
       }
 
     #ifdef EIGEN_EXCEPTIONS
@@ -238,7 +288,7 @@ namespace Eigen
       }
     #endif //EIGEN_EXCEPTIONS
 
-  #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS
+  #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS
     // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
     #define eigen_assert(a) \
       if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
@@ -294,8 +344,8 @@ namespace Eigen
   #define VERIFY_RAISES_STATIC_ASSERT(a) \
     std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
 #endif
-    
-  #if !defined(__CUDACC__)
+
+  #if !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY)
   #define EIGEN_USE_CUSTOM_ASSERT
   #endif
 
@@ -327,10 +377,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
   }
 }
 
-#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a))
+#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a))
 
-#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b))
-#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b))
+#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a >= b))
+#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a <= b))
 
 
 #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b, true))
@@ -344,8 +394,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
 
 #define VERIFY_IS_UNITARY(a) VERIFY(test_isUnitary(a))
 
+#define STATIC_CHECK(COND) EIGEN_STATIC_ASSERT( (COND) , EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT )
+
 #define CALL_SUBTEST(FUNC) do { \
-    g_test_stack.push_back(EI_PP_MAKE_STRING(FUNC)); \
+    g_test_stack.push_back(EIGEN_MAKESTRING(FUNC)); \
     FUNC; \
     g_test_stack.pop_back(); \
   } while (0)
@@ -353,6 +405,13 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
 
 namespace Eigen {
 
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_type(const T1&, const T2&)
+{
+  return true;
+}
+
 template<typename T> inline typename NumTraits<T>::Real test_precision() { return NumTraits<T>::dummy_precision(); }
 template<> inline float test_precision<float>() { return 1e-3f; }
 template<> inline double test_precision<double>() { return 1e-6; }
@@ -361,37 +420,30 @@ template<> inline float test_precision<std::complex<float> >() { return test_pre
 template<> inline double test_precision<std::complex<double> >() { return test_precision<double>(); }
 template<> inline long double test_precision<std::complex<long double> >() { return test_precision<long double>(); }
 
-inline bool test_isApprox(const short& a, const short& b)
-{ return internal::isApprox(a, b, test_precision<short>()); }
-inline bool test_isApprox(const unsigned short& a, const unsigned short& b)
-{ return internal::isApprox(a, b, test_precision<unsigned short>()); }
-inline bool test_isApprox(const unsigned int& a, const unsigned int& b)
-{ return internal::isApprox(a, b, test_precision<unsigned int>()); }
-inline bool test_isApprox(const long& a, const long& b)
-{ return internal::isApprox(a, b, test_precision<long>()); }
-inline bool test_isApprox(const unsigned long& a, const unsigned long& b)
-{ return internal::isApprox(a, b, test_precision<unsigned long>()); }
-
-inline bool test_isApprox(const int& a, const int& b)
-{ return internal::isApprox(a, b, test_precision<int>()); }
-inline bool test_isMuchSmallerThan(const int& a, const int& b)
-{ return internal::isMuchSmallerThan(a, b, test_precision<int>()); }
-inline bool test_isApproxOrLessThan(const int& a, const int& b)
-{ return internal::isApproxOrLessThan(a, b, test_precision<int>()); }
-
-inline bool test_isApprox(const float& a, const float& b)
-{ return internal::isApprox(a, b, test_precision<float>()); }
-inline bool test_isMuchSmallerThan(const float& a, const float& b)
-{ return internal::isMuchSmallerThan(a, b, test_precision<float>()); }
-inline bool test_isApproxOrLessThan(const float& a, const float& b)
-{ return internal::isApproxOrLessThan(a, b, test_precision<float>()); }
-
-inline bool test_isApprox(const double& a, const double& b)
-{ return internal::isApprox(a, b, test_precision<double>()); }
-inline bool test_isMuchSmallerThan(const double& a, const double& b)
-{ return internal::isMuchSmallerThan(a, b, test_precision<double>()); }
-inline bool test_isApproxOrLessThan(const double& a, const double& b)
-{ return internal::isApproxOrLessThan(a, b, test_precision<double>()); }
+#define EIGEN_TEST_SCALAR_TEST_OVERLOAD(TYPE)                             \
+  inline bool test_isApprox(TYPE a, TYPE b)                               \
+  { return internal::isApprox(a, b, test_precision<TYPE>()); }            \
+  inline bool test_isMuchSmallerThan(TYPE a, TYPE b)                      \
+  { return internal::isMuchSmallerThan(a, b, test_precision<TYPE>()); }   \
+  inline bool test_isApproxOrLessThan(TYPE a, TYPE b)                     \
+  { return internal::isApproxOrLessThan(a, b, test_precision<TYPE>()); }
+
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(short)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned short)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(int)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned int)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(long)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned long)
+#if EIGEN_HAS_CXX11
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(long long)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(unsigned long long)
+#endif
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(float)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(double)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(half)
+EIGEN_TEST_SCALAR_TEST_OVERLOAD(bfloat16)
+
+#undef EIGEN_TEST_SCALAR_TEST_OVERLOAD
 
 #ifndef EIGEN_TEST_NO_COMPLEX
 inline bool test_isApprox(const std::complex<float>& a, const std::complex<float>& b)
@@ -428,13 +480,6 @@ inline bool test_isApproxOrLessThan(const long double& a, const long double& b)
 { return internal::isApproxOrLessThan(a, b, test_precision<long double>()); }
 #endif // EIGEN_TEST_NO_LONGDOUBLE
 
-inline bool test_isApprox(const half& a, const half& b)
-{ return internal::isApprox(a, b, test_precision<half>()); }
-inline bool test_isMuchSmallerThan(const half& a, const half& b)
-{ return internal::isMuchSmallerThan(a, b, test_precision<half>()); }
-inline bool test_isApproxOrLessThan(const half& a, const half& b)
-{ return internal::isApproxOrLessThan(a, b, test_precision<half>()); }
-
 // test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox.
 template<typename T1,typename T2>
 typename NumTraits<typename T1::RealScalar>::NonInteger test_relative_error(const EigenBase<T1> &a, const EigenBase<T2> &b)
@@ -501,7 +546,7 @@ template<typename T1,typename T2>
 typename NumTraits<typename NumTraits<T1>::Real>::NonInteger test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T1>::Real>::value, T1>::type* = 0)
 {
   typedef typename NumTraits<typename NumTraits<T1>::Real>::NonInteger RealScalar;
-  return numext::sqrt(RealScalar(numext::abs2(a-b))/RealScalar((numext::mini)(numext::abs2(a),numext::abs2(b))));
+  return numext::sqrt(RealScalar(numext::abs2(a-b))/(numext::mini)(RealScalar(numext::abs2(a)),RealScalar(numext::abs2(b))));
 }
 
 template<typename T>
@@ -696,9 +741,6 @@ template<> std::string type_name<std::complex<double> >()       { return "comple
 template<> std::string type_name<std::complex<long double> >()  { return "complex<long double>"; }
 template<> std::string type_name<std::complex<int> >()          { return "complex<int>"; }
 
-// forward declaration of the main test function
-void EIGEN_CAT(test_,EIGEN_TEST_FUNC)();
-
 using namespace Eigen;
 
 inline void set_repeat_from_string(const char *str)
@@ -785,9 +827,16 @@ int main(int argc, char *argv[])
     srand(g_seed);
     std::cout << "Repeating each test " << g_repeat << " times" << std::endl;
 
-    Eigen::g_test_stack.push_back(std::string(EI_PP_MAKE_STRING(EIGEN_TEST_FUNC)));
+    VERIFY(EigenTest::all().size()>0);
+
+    for(std::size_t i=0; i<EigenTest::all().size(); ++i)
+    {
+      const EigenTest& current_test = *EigenTest::all()[i];
+      Eigen::g_test_stack.push_back(current_test.name());
+      current_test();
+      Eigen::g_test_stack.pop_back();
+    }
 
-    EIGEN_CAT(test_,EIGEN_TEST_FUNC)();
     return 0;
 }
 
diff --git a/contrib/eigen/test/mapped_matrix.cpp b/contrib/eigen/test/mapped_matrix.cpp
index bc8a694ab96e1e19f0af67b25bbc73f934dfbacd..0ea136ae6293b7d224db9745001e91f1157d5325 100644
--- a/contrib/eigen/test/mapped_matrix.cpp
+++ b/contrib/eigen/test/mapped_matrix.cpp
@@ -178,7 +178,7 @@ void map_not_aligned_on_scalar()
   internal::aligned_delete(array1, (size+1)*(size+1)+1);
 }
 
-void test_mapped_matrix()
+EIGEN_DECLARE_TEST(mapped_matrix)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( map_class_vector(Matrix<float, 1, 1>()) );
@@ -202,7 +202,6 @@ void test_mapped_matrix()
     CALL_SUBTEST_8( map_static_methods(RowVector3d()) );
     CALL_SUBTEST_9( map_static_methods(VectorXcd(8)) );
     CALL_SUBTEST_10( map_static_methods(VectorXf(12)) );
-    
     CALL_SUBTEST_11( map_not_aligned_on_scalar<double>() );
   }
 }
diff --git a/contrib/eigen/test/mapstaticmethods.cpp b/contrib/eigen/test/mapstaticmethods.cpp
index 8156ca939537dae1a765b49dd160a282f294a43d..d0128ba94f9b702a3105e76b287662a51dd2d5b2 100644
--- a/contrib/eigen/test/mapstaticmethods.cpp
+++ b/contrib/eigen/test/mapstaticmethods.cpp
@@ -9,8 +9,12 @@
 
 #include "main.h"
 
+// GCC<=4.8 has spurious shadow warnings, because `ptr` re-appears inside template instantiations
+// workaround: put these in an anonymous namespace
+namespace {
 float *ptr;
 const float *const_ptr;
+}
 
 template<typename PlainObjectType,
          bool IsDynamicSize = PlainObjectType::SizeAtCompileTime == Dynamic,
@@ -143,7 +147,7 @@ void mapstaticmethods(const PlainObjectType& m)
   VERIFY(true); // just to avoid 'unused function' warning
 }
 
-void test_mapstaticmethods()
+EIGEN_DECLARE_TEST(mapstaticmethods)
 {
   ptr = internal::aligned_new<float>(1000);
   for(int i = 0; i < 1000; i++) ptr[i] = float(i);
diff --git a/contrib/eigen/test/mapstride.cpp b/contrib/eigen/test/mapstride.cpp
index d785148cf67eea27246bd3891593315a91d9f888..fde73f2eccf62f48907812f16acbfad31bb03d91 100644
--- a/contrib/eigen/test/mapstride.cpp
+++ b/contrib/eigen/test/mapstride.cpp
@@ -162,6 +162,32 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
     VERIFY_IS_APPROX(map,s1*m);
   }
 
+  // test negative strides
+  {
+    Matrix<Scalar,Dynamic,1>::Map(a_array1, arraysize+1).setRandom();
+    Index outerstride = m.innerSize()+4;
+    Scalar* array = array1;
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, OuterStride<> > map2(array+(m.outerSize()-1)*outerstride, rows, cols, OuterStride<>(-outerstride));
+      if(MatrixType::IsRowMajor)  VERIFY_IS_APPROX(map1.colwise().reverse(), map2);
+      else                        VERIFY_IS_APPROX(map1.rowwise().reverse(), map2);
+    }
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, Stride<Dynamic,Dynamic> > map2(array+(m.outerSize()-1)*outerstride+m.innerSize()-1, rows, cols, Stride<Dynamic,Dynamic>(-outerstride,-1));
+      VERIFY_IS_APPROX(map1.reverse(), map2);
+    }
+
+    {
+      Map<MatrixType, Alignment, OuterStride<> > map1(array, rows, cols, OuterStride<>( outerstride));
+      Map<MatrixType, Unaligned, Stride<Dynamic,-1> > map2(array+(m.outerSize()-1)*outerstride+m.innerSize()-1, rows, cols, Stride<Dynamic,-1>(-outerstride,-1));
+      VERIFY_IS_APPROX(map1.reverse(), map2);
+    }
+  }
+
   internal::aligned_delete(a_array1, arraysize+1);
 }
 
@@ -197,10 +223,10 @@ void bug1453()
   VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
 }
 
-void test_mapstride()
+EIGEN_DECLARE_TEST(mapstride)
 {
   for(int i = 0; i < g_repeat; i++) {
-    int maxn = 30;
+    int maxn = 3;
     CALL_SUBTEST_1( map_class_vector<Aligned>(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_1( map_class_vector<Unaligned>(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( map_class_vector<Aligned>(Vector4d()) );
diff --git a/contrib/eigen/test/meta.cpp b/contrib/eigen/test/meta.cpp
index b8dea68e8083f56c651d7578a192a47e1a588b6c..7a8b93c3d8bd78e5d0dae361988f17e35293dd0b 100644
--- a/contrib/eigen/test/meta.cpp
+++ b/contrib/eigen/test/meta.cpp
@@ -15,14 +15,26 @@ bool check_is_convertible(const From&, const To&)
   return internal::is_convertible<From,To>::value;
 }
 
-void test_meta()
+struct FooReturnType {
+  typedef int ReturnType;
+};
+
+struct MyInterface {
+  virtual void func() = 0;
+  virtual ~MyInterface() {}
+};
+struct MyImpl : public MyInterface {
+  void func() {}
+};
+
+EIGEN_DECLARE_TEST(meta)
 {
   VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
   VERIFY(( internal::is_same<float,float>::value));
   VERIFY((!internal::is_same<float,double>::value));
   VERIFY((!internal::is_same<float,float&>::value));
   VERIFY((!internal::is_same<float,const float&>::value));
-  
+
   VERIFY(( internal::is_same<float,internal::remove_all<const float&>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_all<const float*>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_all<const float*&>::type >::value));
@@ -51,23 +63,40 @@ void test_meta()
 
   VERIFY(( internal::is_same< internal::add_const_on_value_type<const float* const>::type, const float* const>::value));
   VERIFY(( internal::is_same< internal::add_const_on_value_type<float* const>::type, const float* const>::value));
-  
+
   VERIFY(( internal::is_same<float,internal::remove_reference<float&>::type >::value));
   VERIFY(( internal::is_same<const float,internal::remove_reference<const float&>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_pointer<float*>::type >::value));
   VERIFY(( internal::is_same<const float,internal::remove_pointer<const float*>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_pointer<float* const >::type >::value));
-  
-  VERIFY(( internal::is_convertible<float,double>::value ));
-  VERIFY(( internal::is_convertible<int,double>::value ));
-  VERIFY(( internal::is_convertible<double,int>::value ));
-  VERIFY((!internal::is_convertible<std::complex<double>,double>::value ));
-  VERIFY(( internal::is_convertible<Array33f,Matrix3f>::value ));
-//   VERIFY((!internal::is_convertible<Matrix3f,Matrix3d>::value )); //does not work because the conversion is prevented by a static assertion
-  VERIFY((!internal::is_convertible<Array33f,int>::value ));
-  VERIFY((!internal::is_convertible<MatrixXf,float>::value ));
+
+
+  // is_convertible
+  STATIC_CHECK(( internal::is_convertible<float,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<int,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<int, short>::value ));
+  STATIC_CHECK(( internal::is_convertible<short, int>::value ));
+  STATIC_CHECK(( internal::is_convertible<double,int>::value ));
+  STATIC_CHECK(( internal::is_convertible<double,std::complex<double> >::value ));
+  STATIC_CHECK((!internal::is_convertible<std::complex<double>,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<Array33f,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,Matrix3f&>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,const Matrix3f&>::value ));
+  STATIC_CHECK(( internal::is_convertible<const Matrix3f&,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<const Matrix3f&,const Matrix3f&>::value ));
+  STATIC_CHECK((!internal::is_convertible<const Matrix3f&,Matrix3f&>::value ));
+  STATIC_CHECK((!internal::is_convertible<const Matrix3f,Matrix3f&>::value ));
+  STATIC_CHECK(!( internal::is_convertible<Matrix3f,Matrix3f&>::value ));
+
+  STATIC_CHECK(!( internal::is_convertible<int,int&>::value ));
+  STATIC_CHECK(( internal::is_convertible<const int,const int& >::value ));
+
+  //STATIC_CHECK((!internal::is_convertible<Matrix3f,Matrix3d>::value )); //does not even compile because the conversion is prevented by a static assertion
+  STATIC_CHECK((!internal::is_convertible<Array33f,int>::value ));
+  STATIC_CHECK((!internal::is_convertible<MatrixXf,float>::value ));
   {
-    float f;
+    float f = 0.0f;
     MatrixXf A, B;
     VectorXf a, b;
     VERIFY(( check_is_convertible(a.dot(b), f) ));
@@ -75,7 +104,39 @@ void test_meta()
     VERIFY((!check_is_convertible(A*B, f) ));
     VERIFY(( check_is_convertible(A*B, A) ));
   }
-  
+
+  #if (EIGEN_COMP_GNUC  && EIGEN_COMP_GNUC  <=  99) \
+   || (EIGEN_COMP_CLANG && EIGEN_COMP_CLANG <= 909) \
+   || (EIGEN_COMP_MSVC  && EIGEN_COMP_MSVC  <=1914)
+  // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1752,
+  // basically, a fix in the c++ standard breaks our c++98 implementation
+  // of is_convertible for abstract classes.
+  // So the following tests are expected to fail with recent compilers.
+
+  STATIC_CHECK(( !internal::is_convertible<MyInterface, MyImpl>::value ));
+  #if (!EIGEN_COMP_GNUC_STRICT) || (EIGEN_GNUC_AT_LEAST(4,8))
+  // GCC prior to 4.8 fails to compile this test:
+  // error: cannot allocate an object of abstract type 'MyInterface'
+  // In other word, it does not obey SFINAE.
+  // Nevertheless, we don't really care about supporting abstract type as scalar type!
+  STATIC_CHECK(( !internal::is_convertible<MyImpl, MyInterface>::value ));
+  #endif
+  STATIC_CHECK((  internal::is_convertible<MyImpl, const MyInterface&>::value ));
+
+  #endif
+
+  {
+    int i = 0;
+    VERIFY(( check_is_convertible(fix<3>(), i) ));
+    VERIFY((!check_is_convertible(i, fix<DynamicIndex>()) ));
+  }
+
+
+  VERIFY((  internal::has_ReturnType<FooReturnType>::value ));
+  VERIFY((  internal::has_ReturnType<ScalarBinaryOpTraits<int,int> >::value ));
+  VERIFY(( !internal::has_ReturnType<MatrixXf>::value ));
+  VERIFY(( !internal::has_ReturnType<int>::value ));
+
   VERIFY(internal::meta_sqrt<1>::ret == 1);
   #define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))
   VERIFY_META_SQRT(2);
diff --git a/contrib/eigen/test/metis_support.cpp b/contrib/eigen/test/metis_support.cpp
index d87c56a13055772fcc60047e32ead8364f5091ff..b490dacde9839aaf51d67ceaa6e846d745db52db 100644
--- a/contrib/eigen/test/metis_support.cpp
+++ b/contrib/eigen/test/metis_support.cpp
@@ -19,7 +19,7 @@ template<typename T> void test_metis_T()
   check_sparse_square_solving(sparselu_metis); 
 }
 
-void test_metis_support()
+EIGEN_DECLARE_TEST(metis_support)
 {
   CALL_SUBTEST_1(test_metis_T<double>());
 }
diff --git a/contrib/eigen/test/miscmatrices.cpp b/contrib/eigen/test/miscmatrices.cpp
index f17291c408ced32fe48e66165521315fdd747891..e71712f33290ae1c716d198e53b922530f3fcf10 100644
--- a/contrib/eigen/test/miscmatrices.cpp
+++ b/contrib/eigen/test/miscmatrices.cpp
@@ -34,7 +34,7 @@ template<typename MatrixType> void miscMatrices(const MatrixType& m)
   VERIFY_IS_APPROX(square, MatrixType::Identity(rows, rows));
 }
 
-void test_miscmatrices()
+EIGEN_DECLARE_TEST(miscmatrices)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( miscMatrices(Matrix<float, 1, 1>()) );
diff --git a/contrib/eigen/test/mixingtypes.cpp b/contrib/eigen/test/mixingtypes.cpp
index 45d79aa0c35d1d4709f132c7d810dd413e428377..d450dbff8bbfd324e5b7b19b0329dbebef52ba45 100644
--- a/contrib/eigen/test/mixingtypes.cpp
+++ b/contrib/eigen/test/mixingtypes.cpp
@@ -309,8 +309,9 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_IS_APPROX( rcd.noalias() -= mcd + md*md,           - ((md*md).eval().template cast<CD>()) );
 }
 
-void test_mixingtypes()
+EIGEN_DECLARE_TEST(mixingtypes)
 {
+  g_called = false; // Silence -Wunneeded-internal-declaration.
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(mixingtypes<3>());
     CALL_SUBTEST_2(mixingtypes<4>());
diff --git a/contrib/eigen/test/mpl2only.cpp b/contrib/eigen/test/mpl2only.cpp
index 7d04d6bba617bb19d12c8471196c2a68daf34554..296350d082fdf7d3e989ae56afe0d01c54786a11 100644
--- a/contrib/eigen/test/mpl2only.cpp
+++ b/contrib/eigen/test/mpl2only.cpp
@@ -7,7 +7,9 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifndef EIGEN_MPL2_ONLY
 #define EIGEN_MPL2_ONLY
+#endif
 #include <Eigen/Dense>
 #include <Eigen/SparseCore>
 #include <Eigen/SparseLU>
diff --git a/contrib/eigen/test/nestbyvalue.cpp b/contrib/eigen/test/nestbyvalue.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c5356bc24cb8e571309b5cb674aecc41602fedd2
--- /dev/null
+++ b/contrib/eigen/test/nestbyvalue.cpp
@@ -0,0 +1,37 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define TEST_ENABLE_TEMPORARY_TRACKING
+
+#include "main.h"
+
+typedef NestByValue<MatrixXd> CpyMatrixXd;
+typedef CwiseBinaryOp<internal::scalar_sum_op<double,double>,const CpyMatrixXd,const CpyMatrixXd> XprType;
+
+XprType get_xpr_with_temps(const MatrixXd& a)
+{
+  MatrixXd t1 = a.rowwise().reverse();
+  MatrixXd t2 = a+a;
+  return t1.nestByValue() + t2.nestByValue();
+}
+
+EIGEN_DECLARE_TEST(nestbyvalue)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    Index rows = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+    Index cols = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+    MatrixXd a = MatrixXd(rows,cols);
+    nb_temporaries = 0;
+    XprType x = get_xpr_with_temps(a);
+    VERIFY_IS_EQUAL(nb_temporaries,6);
+    MatrixXd b = x;
+    VERIFY_IS_EQUAL(nb_temporaries,6+1);
+    VERIFY_IS_APPROX(b, a.rowwise().reverse().eval() + (a+a).eval());
+  }
+}
diff --git a/contrib/eigen/test/nesting_ops.cpp b/contrib/eigen/test/nesting_ops.cpp
index a419b0e44ac5f9e343ea177c600e23c5c692184c..4b5fc21f259d13570e979d31086511007fa6a358 100644
--- a/contrib/eigen/test/nesting_ops.cpp
+++ b/contrib/eigen/test/nesting_ops.cpp
@@ -91,7 +91,7 @@ template <typename MatrixType> void run_nesting_ops_2(const MatrixType& _m)
 }
 
 
-void test_nesting_ops()
+EIGEN_DECLARE_TEST(nesting_ops)
 {
   CALL_SUBTEST_1(run_nesting_ops_1(MatrixXf::Random(25,25)));
   CALL_SUBTEST_2(run_nesting_ops_1(MatrixXcd::Random(25,25)));
diff --git a/contrib/eigen/test/nomalloc.cpp b/contrib/eigen/test/nomalloc.cpp
index b7ea4d362ce641d056b98972fd67689ed22d8132..cb4c073e941895b9e72f6d39cdb8ae70f707598e 100644
--- a/contrib/eigen/test/nomalloc.cpp
+++ b/contrib/eigen/test/nomalloc.cpp
@@ -172,7 +172,7 @@ template<typename MatrixType> void test_reference(const MatrixType& m) {
   typedef typename MatrixType::Scalar Scalar;
   enum { Flag          =  MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
   enum { TransposeFlag = !MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
-  typename MatrixType::Index rows = m.rows(), cols=m.cols();
+  Index rows = m.rows(), cols=m.cols();
   typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Flag         > MatrixX;
   typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, TransposeFlag> MatrixXT;
   // Dynamic reference:
@@ -202,7 +202,7 @@ template<typename MatrixType> void test_reference(const MatrixType& m) {
 
 }
 
-void test_nomalloc()
+EIGEN_DECLARE_TEST(nomalloc)
 {
   // create some dynamic objects
   Eigen::MatrixXd M1 = MatrixXd::Random(3,3);
diff --git a/contrib/eigen/test/nullary.cpp b/contrib/eigen/test/nullary.cpp
index acd55506e98bf7c7d9e8bb4576da7dd89075c2a9..9b25ea4f36b6be5b02af2f52e7268250d91862cb 100644
--- a/contrib/eigen/test/nullary.cpp
+++ b/contrib/eigen/test/nullary.cpp
@@ -70,7 +70,7 @@ void testVectorType(const VectorType& base)
   
   Scalar high = internal::random<Scalar>(-500,500);
   Scalar low = (size == 1 ? high : internal::random<Scalar>(-500,500));
-  if (low>high) std::swap(low,high);
+  if (numext::real(low)>numext::real(high)) std::swap(low,high);
 
   // check low==high
   if(internal::random<float>(0.f,1.f)<0.05f)
@@ -79,7 +79,7 @@ void testVectorType(const VectorType& base)
   else if(size>2 && std::numeric_limits<RealScalar>::max_exponent10>0 && internal::random<float>(0.f,1.f)<0.1f)
     low = -internal::random<Scalar>(1,2) * RealScalar(std::pow(RealScalar(10),std::numeric_limits<RealScalar>::max_exponent10/2));
 
-  const Scalar step = ((size == 1) ? 1 : (high-low)/(size-1));
+  const Scalar step = ((size == 1) ? 1 : (high-low)/RealScalar(size-1));
 
   // check whether the result yields what we expect it to do
   VectorType m(base);
@@ -89,21 +89,22 @@ void testVectorType(const VectorType& base)
   {
     VectorType n(size);
     for (int i=0; i<size; ++i)
-      n(i) = low+i*step;
+      n(i) = low+RealScalar(i)*step;
     VERIFY_IS_APPROX(m,n);
 
     CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
   }
 
-  if((!NumTraits<Scalar>::IsInteger) || ((high-low)>=size && (Index(high-low)%(size-1))==0) || (Index(high-low+1)<size && (size%Index(high-low+1))==0))
+  RealScalar range_length = numext::real(high-low);
+  if((!NumTraits<Scalar>::IsInteger) || (range_length>=size && (Index(range_length)%(size-1))==0) || (Index(range_length+1)<size && (size%Index(range_length+1))==0))
   {
     VectorType n(size);
-    if((!NumTraits<Scalar>::IsInteger) || (high-low>=size))
+    if((!NumTraits<Scalar>::IsInteger) || (range_length>=size))
       for (int i=0; i<size; ++i)
-        n(i) = size==1 ? low : (low + ((high-low)*Scalar(i))/(size-1));
+        n(i) = size==1 ? low : (low + ((high-low)*Scalar(i))/RealScalar(size-1));
     else
       for (int i=0; i<size; ++i)
-        n(i) = size==1 ? low : low + Scalar((double(high-low+1)*double(i))/double(size));
+        n(i) = size==1 ? low : low + Scalar((double(range_length+1)*double(i))/double(size));
     VERIFY_IS_APPROX(m,n);
 
     // random access version
@@ -116,12 +117,12 @@ void testVectorType(const VectorType& base)
       CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
   }
 
-  VERIFY( m(m.size()-1) <= high );
-  VERIFY( (m.array() <= high).all() );
-  VERIFY( (m.array() >= low).all() );
+  VERIFY( numext::real(m(m.size()-1)) <= numext::real(high) );
+  VERIFY( (m.array().real() <= numext::real(high)).all() );
+  VERIFY( (m.array().real() >= numext::real(low)).all() );
 
 
-  VERIFY( m(m.size()-1) >= low );
+  VERIFY( numext::real(m(m.size()-1)) >= numext::real(low) );
   if(size>=1)
   {
     VERIFY( internal::isApprox(m(0),low) );
@@ -135,7 +136,7 @@ void testVectorType(const VectorType& base)
   col_vector.setLinSpaced(size,low,high);
   // when using the extended precision (e.g., FPU) the relative error might exceed 1 bit
   // when computing the squared sum in isApprox, thus the 2x factor.
-  VERIFY( row_vector.isApprox(col_vector.transpose(), Scalar(2)*NumTraits<Scalar>::epsilon()));
+  VERIFY( row_vector.isApprox(col_vector.transpose(), RealScalar(2)*NumTraits<Scalar>::epsilon()));
 
   Matrix<Scalar,Dynamic,1> size_changer(size+50);
   size_changer.setLinSpaced(size,low,high);
@@ -157,18 +158,18 @@ void testVectorType(const VectorType& base)
   {
     Index n0 = VectorType::SizeAtCompileTime==Dynamic ? 0 : VectorType::SizeAtCompileTime;
     low = internal::random<Scalar>();
-    m = VectorType::LinSpaced(n0,low,low-1);
+    m = VectorType::LinSpaced(n0,low,low-RealScalar(1));
     VERIFY(m.size()==n0);
 
     if(VectorType::SizeAtCompileTime==Dynamic)
     {
       VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,0,Scalar(n0-1)).sum(),Scalar(0));
-      VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,low,low-1).sum(),Scalar(0));
+      VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,low,low-RealScalar(1)).sum(),Scalar(0));
     }
 
     m.setLinSpaced(n0,0,Scalar(n0-1));
     VERIFY(m.size()==n0);
-    m.setLinSpaced(n0,low,low-1);
+    m.setLinSpaced(n0,low,low-RealScalar(1));
     VERIFY(m.size()==n0);
 
     // empty range only:
@@ -178,19 +179,37 @@ void testVectorType(const VectorType& base)
 
     if(NumTraits<Scalar>::IsInteger)
     {
-      VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,Scalar(low+size-1)), VectorType::LinSpaced(size,Scalar(low+size-1),low).reverse() );
+      VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,low+Scalar(size-1)), VectorType::LinSpaced(size,low+Scalar(size-1),low).reverse() );
 
       if(VectorType::SizeAtCompileTime==Dynamic)
       {
         // Check negative multiplicator path:
         for(Index k=1; k<5; ++k)
-          VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,Scalar(low+(size-1)*k)), VectorType::LinSpaced(size,Scalar(low+(size-1)*k),low).reverse() );
+          VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,low+Scalar((size-1)*k)), VectorType::LinSpaced(size,low+Scalar((size-1)*k),low).reverse() );
         // Check negative divisor path:
         for(Index k=1; k<5; ++k)
-          VERIFY_IS_APPROX( VectorType::LinSpaced(size*k,low,Scalar(low+size-1)), VectorType::LinSpaced(size*k,Scalar(low+size-1),low).reverse() );
+          VERIFY_IS_APPROX( VectorType::LinSpaced(size*k,low,low+Scalar(size-1)), VectorType::LinSpaced(size*k,low+Scalar(size-1),low).reverse() );
       }
     }
   }
+
+  // test setUnit()
+  if(m.size()>0)
+  {
+    for(Index k=0; k<10; ++k)
+    {
+      Index i = internal::random<Index>(0,m.size()-1);
+      m.setUnit(i);
+      VERIFY_IS_APPROX( m, VectorType::Unit(m.size(), i) );
+    }
+    if(VectorType::SizeAtCompileTime==Dynamic)
+    {
+      Index i = internal::random<Index>(0,2*m.size()-1);
+      m.setUnit(2*m.size(),i);
+      VERIFY_IS_APPROX( m, VectorType::Unit(m.size(),i) );
+    }
+  }
+
 }
 
 template<typename MatrixType>
@@ -221,45 +240,36 @@ void testMatrixType(const MatrixType& m)
   VERIFY_IS_APPROX( A(i,j), s1 );
 }
 
-void test_nullary()
+template<int>
+void bug79()
 {
-  CALL_SUBTEST_1( testMatrixType(Matrix2d()) );
-  CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random<int>(1,300),internal::random<int>(1,300))) );
-  CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
-  
-  for(int i = 0; i < g_repeat*10; i++) {
-    CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
-    CALL_SUBTEST_5( testVectorType(Vector4d()) );  // regression test for bug 232
-    CALL_SUBTEST_6( testVectorType(Vector3d()) );
-    CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
-    CALL_SUBTEST_8( testVectorType(Vector3f()) );
-    CALL_SUBTEST_8( testVectorType(Vector4f()) );
-    CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
-    CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
-
-    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,10))) );
-    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(9,300))) );
-    CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
-  }
-
-#ifdef EIGEN_TEST_PART_6
   // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
   VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<double>::epsilon() );
-#endif
+}
 
-#ifdef EIGEN_TEST_PART_9
+template<int>
+void bug1630()
+{
+  Array4d x4 = Array4d::LinSpaced(0.0, 1.0);
+  Array3d x3(Array4d::LinSpaced(0.0, 1.0).head(3));
+  VERIFY_IS_APPROX(x4.head(3), x3);
+}
+
+template<int>
+void nullary_overflow()
+{
   // Check possible overflow issue
-  {
-    int n = 60000;
-    ArrayXi a1(n), a2(n);
-    a1.setLinSpaced(n, 0, n-1);
-    for(int i=0; i<n; ++i)
-      a2(i) = i;
-    VERIFY_IS_APPROX(a1,a2);
-  }
-#endif
+  int n = 60000;
+  ArrayXi a1(n), a2(n);
+  a1.setLinSpaced(n, 0, n-1);
+  for(int i=0; i<n; ++i)
+    a2(i) = i;
+  VERIFY_IS_APPROX(a1,a2);
+}
 
-#ifdef EIGEN_TEST_PART_10
+template<int>
+void nullary_internal_logic()
+{
   // check some internal logic
   VERIFY((  internal::has_nullary_operator<internal::scalar_constant_op<double> >::value ));
   VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<double> >::value ));
@@ -271,10 +281,10 @@ void test_nullary()
   VERIFY((  internal::has_binary_operator<internal::scalar_identity_op<double> >::value ));
   VERIFY(( !internal::functor_has_linear_access<internal::scalar_identity_op<double> >::ret ));
 
-  VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<float,float> >::value ));
-  VERIFY((  internal::has_unary_operator<internal::linspaced_op<float,float> >::value ));
-  VERIFY(( !internal::has_binary_operator<internal::linspaced_op<float,float> >::value ));
-  VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<float,float> >::ret ));
+  VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<float> >::value ));
+  VERIFY((  internal::has_unary_operator<internal::linspaced_op<float> >::value ));
+  VERIFY(( !internal::has_binary_operator<internal::linspaced_op<float> >::value ));
+  VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<float> >::ret ));
 
   // Regression unit test for a weird MSVC bug.
   // Search "nullary_wrapper_workaround_msvc" in CoreEvaluators.h for the details.
@@ -295,10 +305,37 @@ void test_nullary()
     VERIFY(( !internal::has_binary_operator<internal::scalar_constant_op<float> >::value ));
     VERIFY((  internal::functor_has_linear_access<internal::scalar_constant_op<float> >::ret ));
 
-    VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<int,int> >::value ));
-    VERIFY((  internal::has_unary_operator<internal::linspaced_op<int,int> >::value ));
-    VERIFY(( !internal::has_binary_operator<internal::linspaced_op<int,int> >::value ));
-    VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<int,int> >::ret ));
+    VERIFY(( !internal::has_nullary_operator<internal::linspaced_op<int> >::value ));
+    VERIFY((  internal::has_unary_operator<internal::linspaced_op<int> >::value ));
+    VERIFY(( !internal::has_binary_operator<internal::linspaced_op<int> >::value ));
+    VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<int> >::ret ));
   }
-#endif
+}
+
+EIGEN_DECLARE_TEST(nullary)
+{
+  CALL_SUBTEST_1( testMatrixType(Matrix2d()) );
+  CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random<int>(1,300),internal::random<int>(1,300))) );
+  CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
+  
+  for(int i = 0; i < g_repeat*10; i++) {
+    CALL_SUBTEST_3( testVectorType(VectorXcd(internal::random<int>(1,30000))) );
+    CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
+    CALL_SUBTEST_5( testVectorType(Vector4d()) );  // regression test for bug 232
+    CALL_SUBTEST_6( testVectorType(Vector3d()) );
+    CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
+    CALL_SUBTEST_8( testVectorType(Vector3f()) );
+    CALL_SUBTEST_8( testVectorType(Vector4f()) );
+    CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
+    CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
+
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,10))) );
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(9,300))) );
+    CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
+  }
+
+  CALL_SUBTEST_6( bug79<0>() );
+  CALL_SUBTEST_6( bug1630<0>() );
+  CALL_SUBTEST_9( nullary_overflow<0>() );
+  CALL_SUBTEST_10( nullary_internal_logic<0>() );
 }
diff --git a/contrib/eigen/test/num_dimensions.cpp b/contrib/eigen/test/num_dimensions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ad7ef6979d67cf3390bb4ce609f688d183cfea0
--- /dev/null
+++ b/contrib/eigen/test/num_dimensions.cpp
@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/SparseCore>
+
+template<int ExpectedDim,typename Xpr>
+void check_dim(const Xpr& ) {
+  STATIC_CHECK( Xpr::NumDimensions == ExpectedDim );
+}
+
+#if EIGEN_HAS_CXX11
+template<template <typename,int,int> class Object>
+void map_num_dimensions()
+{
+  typedef Object<double, 1, 1> ArrayScalarType;
+  typedef Object<double, 2, 1> ArrayVectorType;
+  typedef Object<double, 1, 2> TransposeArrayVectorType;
+  typedef Object<double, 2, 2> ArrayType;
+  typedef Object<double, Eigen::Dynamic, 1> DynamicArrayVectorType;
+  typedef Object<double, 1, Eigen::Dynamic> DynamicTransposeArrayVectorType;
+  typedef Object<double, Eigen::Dynamic, Eigen::Dynamic> DynamicArrayType;
+
+  STATIC_CHECK(ArrayScalarType::NumDimensions == 0);
+  STATIC_CHECK(ArrayVectorType::NumDimensions == 1);
+  STATIC_CHECK(TransposeArrayVectorType::NumDimensions == 1);
+  STATIC_CHECK(ArrayType::NumDimensions == 2);
+  STATIC_CHECK(DynamicArrayVectorType::NumDimensions == 1);
+  STATIC_CHECK(DynamicTransposeArrayVectorType::NumDimensions == 1);
+  STATIC_CHECK(DynamicArrayType::NumDimensions == 2);
+
+  typedef Eigen::Map<ArrayScalarType> ArrayScalarMap;
+  typedef Eigen::Map<ArrayVectorType> ArrayVectorMap;
+  typedef Eigen::Map<TransposeArrayVectorType> TransposeArrayVectorMap;
+  typedef Eigen::Map<ArrayType> ArrayMap;
+  typedef Eigen::Map<DynamicArrayVectorType> DynamicArrayVectorMap;
+  typedef Eigen::Map<DynamicTransposeArrayVectorType> DynamicTransposeArrayVectorMap;
+  typedef Eigen::Map<DynamicArrayType> DynamicArrayMap;
+
+  STATIC_CHECK(ArrayScalarMap::NumDimensions == 0);
+  STATIC_CHECK(ArrayVectorMap::NumDimensions == 1);
+  STATIC_CHECK(TransposeArrayVectorMap::NumDimensions == 1);
+  STATIC_CHECK(ArrayMap::NumDimensions == 2);
+  STATIC_CHECK(DynamicArrayVectorMap::NumDimensions == 1);
+  STATIC_CHECK(DynamicTransposeArrayVectorMap::NumDimensions == 1);
+  STATIC_CHECK(DynamicArrayMap::NumDimensions == 2);
+}
+
+template<typename Scalar, int Rows, int Cols>
+using TArray = Array<Scalar,Rows,Cols>;
+
+template<typename Scalar, int Rows, int Cols>
+using TMatrix = Matrix<Scalar,Rows,Cols>;
+
+#endif
+
+EIGEN_DECLARE_TEST(num_dimensions)
+{
+  int n = 10;
+  ArrayXXd A(n,n);
+  CALL_SUBTEST( check_dim<2>(A) );
+  CALL_SUBTEST( check_dim<2>(A.block(1,1,2,2)) );
+  CALL_SUBTEST( check_dim<1>(A.col(1)) );
+  CALL_SUBTEST( check_dim<1>(A.row(1)) );
+
+  MatrixXd M(n,n);
+  CALL_SUBTEST( check_dim<0>(M.row(1)*M.col(1)) );
+
+  SparseMatrix<double> S(n,n);
+  CALL_SUBTEST( check_dim<2>(S) );
+  CALL_SUBTEST( check_dim<2>(S.block(1,1,2,2)) );
+  CALL_SUBTEST( check_dim<1>(S.col(1)) );
+  CALL_SUBTEST( check_dim<1>(S.row(1)) );
+
+  SparseVector<double> s(n);
+  CALL_SUBTEST( check_dim<1>(s) );
+  CALL_SUBTEST( check_dim<1>(s.head(2)) );
+  
+
+  #if EIGEN_HAS_CXX11
+  CALL_SUBTEST( map_num_dimensions<TArray>() );
+  CALL_SUBTEST( map_num_dimensions<TMatrix>() );
+  #endif
+}
diff --git a/contrib/eigen/test/numext.cpp b/contrib/eigen/test/numext.cpp
index beba9e911b37642d2820760e707417de9b43e7e0..8a2fde5015bc50baaf41d0c3b3eb8c2662626f20 100644
--- a/contrib/eigen/test/numext.cpp
+++ b/contrib/eigen/test/numext.cpp
@@ -9,6 +9,33 @@
 
 #include "main.h"
 
+template<typename T, typename U>
+bool check_if_equal_or_nans(const T& actual, const U& expected) {
+  return ((actual == expected) || ((numext::isnan)(actual) && (numext::isnan)(expected)));
+}
+
+template<typename T, typename U>
+bool check_if_equal_or_nans(const std::complex<T>& actual, const std::complex<U>& expected) {
+  return check_if_equal_or_nans(numext::real(actual), numext::real(expected))
+         && check_if_equal_or_nans(numext::imag(actual), numext::imag(expected));
+}
+
+template<typename T, typename U>
+bool test_is_equal_or_nans(const T& actual, const U& expected)
+{
+    if (check_if_equal_or_nans(actual, expected)) {
+      return true;
+    }
+
+    // false:
+    std::cerr
+        << "\n    actual   = " << actual
+        << "\n    expected = " << expected << "\n\n";
+    return false;
+}
+
+#define VERIFY_IS_EQUAL_OR_NANS(a, b) VERIFY(test_is_equal_or_nans(a, b))
+
 template<typename T>
 void check_abs() {
   typedef typename NumTraits<T>::Real Real;
@@ -19,7 +46,7 @@ void check_abs() {
   VERIFY_IS_EQUAL(numext::abs(T(0)), T(0));
   VERIFY_IS_EQUAL(numext::abs(T(1)), T(1));
 
-  for(int k=0; k<g_repeat*100; ++k)
+  for(int k=0; k<100; ++k)
   {
     T x = internal::random<T>();
     if(!internal::is_same<T,bool>::value)
@@ -34,21 +61,215 @@ void check_abs() {
   }
 }
 
-void test_numext() {
-  CALL_SUBTEST( check_abs<bool>() );
-  CALL_SUBTEST( check_abs<signed char>() );
-  CALL_SUBTEST( check_abs<unsigned char>() );
-  CALL_SUBTEST( check_abs<short>() );
-  CALL_SUBTEST( check_abs<unsigned short>() );
-  CALL_SUBTEST( check_abs<int>() );
-  CALL_SUBTEST( check_abs<unsigned int>() );
-  CALL_SUBTEST( check_abs<long>() );
-  CALL_SUBTEST( check_abs<unsigned long>() );
-  CALL_SUBTEST( check_abs<half>() );
-  CALL_SUBTEST( check_abs<float>() );
-  CALL_SUBTEST( check_abs<double>() );
-  CALL_SUBTEST( check_abs<long double>() );
-
-  CALL_SUBTEST( check_abs<std::complex<float> >() );
-  CALL_SUBTEST( check_abs<std::complex<double> >() );
+template<typename T>
+void check_arg() {
+  typedef typename NumTraits<T>::Real Real;
+  VERIFY_IS_EQUAL(numext::abs(T(0)), T(0));
+  VERIFY_IS_EQUAL(numext::abs(T(1)), T(1));
+
+  for(int k=0; k<100; ++k)
+  {
+    T x = internal::random<T>();
+    Real y = numext::arg(x);
+    VERIFY_IS_APPROX( y, std::arg(x) );
+  }
+}
+
+template<typename T>
+struct check_sqrt_impl {
+  static void run() {
+    for (int i=0; i<1000; ++i) {
+      const T x = numext::abs(internal::random<T>());
+      const T sqrtx = numext::sqrt(x);
+      VERIFY_IS_APPROX(sqrtx*sqrtx, x);
+    }
+
+    // Corner cases.
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+    VERIFY_IS_EQUAL(numext::sqrt(zero), zero);
+    VERIFY_IS_EQUAL(numext::sqrt(inf), inf);
+    VERIFY((numext::isnan)(numext::sqrt(nan)));
+    VERIFY((numext::isnan)(numext::sqrt(-one)));
+  }
+};
+
+template<typename T>
+struct check_sqrt_impl<std::complex<T>  > {
+  static void run() {
+    typedef typename std::complex<T> ComplexT;
+
+    for (int i=0; i<1000; ++i) {
+      const ComplexT x = internal::random<ComplexT>();
+      const ComplexT sqrtx = numext::sqrt(x);
+      VERIFY_IS_APPROX(sqrtx*sqrtx, x);
+    }
+
+    // Corner cases.
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    // Set of corner cases from https://en.cppreference.com/w/cpp/numeric/complex/sqrt
+    const int kNumCorners = 20;
+    const ComplexT corners[kNumCorners][2] = {
+      {ComplexT(zero, zero), ComplexT(zero, zero)},
+      {ComplexT(-zero, zero), ComplexT(zero, zero)},
+      {ComplexT(zero, -zero), ComplexT(zero, zero)},
+      {ComplexT(-zero, -zero), ComplexT(zero, zero)},
+      {ComplexT(one, inf), ComplexT(inf, inf)},
+      {ComplexT(nan, inf), ComplexT(inf, inf)},
+      {ComplexT(one, -inf), ComplexT(inf, -inf)},
+      {ComplexT(nan, -inf), ComplexT(inf, -inf)},
+      {ComplexT(-inf, one), ComplexT(zero, inf)},
+      {ComplexT(inf, one), ComplexT(inf, zero)},
+      {ComplexT(-inf, -one), ComplexT(zero, -inf)},
+      {ComplexT(inf, -one), ComplexT(inf, -zero)},
+      {ComplexT(-inf, nan), ComplexT(nan, inf)},
+      {ComplexT(inf, nan), ComplexT(inf, nan)},
+      {ComplexT(zero, nan), ComplexT(nan, nan)},
+      {ComplexT(one, nan), ComplexT(nan, nan)},
+      {ComplexT(nan, zero), ComplexT(nan, nan)},
+      {ComplexT(nan, one), ComplexT(nan, nan)},
+      {ComplexT(nan, -one), ComplexT(nan, nan)},
+      {ComplexT(nan, nan), ComplexT(nan, nan)},
+    };
+
+    for (int i=0; i<kNumCorners; ++i) {
+      const ComplexT& x = corners[i][0];
+      const ComplexT sqrtx = corners[i][1];
+      VERIFY_IS_EQUAL_OR_NANS(numext::sqrt(x), sqrtx);
+    }
+  }
+};
+
+template<typename T>
+void check_sqrt() {
+  check_sqrt_impl<T>::run();
+}
+
+template<typename T>
+struct check_rsqrt_impl {
+  static void run() {
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    for (int i=0; i<1000; ++i) {
+      const T x = numext::abs(internal::random<T>());
+      const T rsqrtx = numext::rsqrt(x);
+      const T invx = one / x;
+      VERIFY_IS_APPROX(rsqrtx*rsqrtx, invx);
+    }
+
+    // Corner cases.
+    VERIFY_IS_EQUAL(numext::rsqrt(zero), inf);
+    VERIFY_IS_EQUAL(numext::rsqrt(inf), zero);
+    VERIFY((numext::isnan)(numext::rsqrt(nan)));
+    VERIFY((numext::isnan)(numext::rsqrt(-one)));
+  }
+};
+
+template<typename T>
+struct check_rsqrt_impl<std::complex<T> > {
+  static void run() {
+    typedef typename std::complex<T> ComplexT;
+    const T zero = T(0);
+    const T one = T(1);
+    const T inf = std::numeric_limits<T>::infinity();
+    const T nan = std::numeric_limits<T>::quiet_NaN();
+
+    for (int i=0; i<1000; ++i) {
+      const ComplexT x = internal::random<ComplexT>();
+      const ComplexT invx = ComplexT(one, zero) / x;
+      const ComplexT rsqrtx = numext::rsqrt(x);
+      VERIFY_IS_APPROX(rsqrtx*rsqrtx, invx);
+    }
+
+    // GCC and MSVC differ in their treatment of 1/(0 + 0i)
+    //   GCC/clang = (inf, nan)
+    //   MSVC = (nan, nan)
+    // and 1 / (x + inf i)
+    //   GCC/clang = (0, 0)
+    //   MSVC = (nan, nan)
+    #if (EIGEN_COMP_GNUC)
+    {
+      const int kNumCorners = 20;
+      const ComplexT corners[kNumCorners][2] = {
+        // Only consistent across GCC, clang
+        {ComplexT(zero, zero), ComplexT(zero, zero)},
+        {ComplexT(-zero, zero), ComplexT(zero, zero)},
+        {ComplexT(zero, -zero), ComplexT(zero, zero)},
+        {ComplexT(-zero, -zero), ComplexT(zero, zero)},
+        {ComplexT(one, inf), ComplexT(inf, inf)},
+        {ComplexT(nan, inf), ComplexT(inf, inf)},
+        {ComplexT(one, -inf), ComplexT(inf, -inf)},
+        {ComplexT(nan, -inf), ComplexT(inf, -inf)},
+        // Consistent across GCC, clang, MSVC
+        {ComplexT(-inf, one), ComplexT(zero, inf)},
+        {ComplexT(inf, one), ComplexT(inf, zero)},
+        {ComplexT(-inf, -one), ComplexT(zero, -inf)},
+        {ComplexT(inf, -one), ComplexT(inf, -zero)},
+        {ComplexT(-inf, nan), ComplexT(nan, inf)},
+        {ComplexT(inf, nan), ComplexT(inf, nan)},
+        {ComplexT(zero, nan), ComplexT(nan, nan)},
+        {ComplexT(one, nan), ComplexT(nan, nan)},
+        {ComplexT(nan, zero), ComplexT(nan, nan)},
+        {ComplexT(nan, one), ComplexT(nan, nan)},
+        {ComplexT(nan, -one), ComplexT(nan, nan)},
+        {ComplexT(nan, nan), ComplexT(nan, nan)},
+      };
+
+      for (int i=0; i<kNumCorners; ++i) {
+        const ComplexT& x = corners[i][0];
+        const ComplexT rsqrtx = ComplexT(one, zero) / corners[i][1];
+        VERIFY_IS_EQUAL_OR_NANS(numext::rsqrt(x), rsqrtx);
+      }
+    }
+    #endif
+  }
+};
+
+template<typename T>
+void check_rsqrt() {
+  check_rsqrt_impl<T>::run();
+}
+
+EIGEN_DECLARE_TEST(numext) {
+  for(int k=0; k<g_repeat; ++k)
+  {
+    CALL_SUBTEST( check_abs<bool>() );
+    CALL_SUBTEST( check_abs<signed char>() );
+    CALL_SUBTEST( check_abs<unsigned char>() );
+    CALL_SUBTEST( check_abs<short>() );
+    CALL_SUBTEST( check_abs<unsigned short>() );
+    CALL_SUBTEST( check_abs<int>() );
+    CALL_SUBTEST( check_abs<unsigned int>() );
+    CALL_SUBTEST( check_abs<long>() );
+    CALL_SUBTEST( check_abs<unsigned long>() );
+    CALL_SUBTEST( check_abs<half>() );
+    CALL_SUBTEST( check_abs<bfloat16>() );
+    CALL_SUBTEST( check_abs<float>() );
+    CALL_SUBTEST( check_abs<double>() );
+    CALL_SUBTEST( check_abs<long double>() );
+    CALL_SUBTEST( check_abs<std::complex<float> >() );
+    CALL_SUBTEST( check_abs<std::complex<double> >() );
+
+    CALL_SUBTEST( check_arg<std::complex<float> >() );
+    CALL_SUBTEST( check_arg<std::complex<double> >() );
+
+    CALL_SUBTEST( check_sqrt<float>() );
+    CALL_SUBTEST( check_sqrt<double>() );
+    CALL_SUBTEST( check_sqrt<std::complex<float> >() );
+    CALL_SUBTEST( check_sqrt<std::complex<double> >() );
+    
+    CALL_SUBTEST( check_rsqrt<float>() );
+    CALL_SUBTEST( check_rsqrt<double>() );
+    CALL_SUBTEST( check_rsqrt<std::complex<float> >() );
+    CALL_SUBTEST( check_rsqrt<std::complex<double> >() );
+  }
 }
diff --git a/contrib/eigen/test/packetmath.cpp b/contrib/eigen/test/packetmath.cpp
index 330848c3d611cc27086b8b62fb3b042ec89c07ff..121ec7283cacf1ecd79886d8906ea973aa07b2a7 100644
--- a/contrib/eigen/test/packetmath.cpp
+++ b/contrib/eigen/test/packetmath.cpp
@@ -8,284 +8,562 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#include "main.h"
-#include "unsupported/Eigen/SpecialFunctions"
+#include "packetmath_test_shared.h"
+#include "random_without_cast_overflow.h"
 
-#if defined __GNUC__ && __GNUC__>=6
-  #pragma GCC diagnostic ignored "-Wignored-attributes"
-#endif
-// using namespace Eigen;
-
-#ifdef EIGEN_VECTORIZE_SSE
-const bool g_vectorize_sse = true;
-#else
-const bool g_vectorize_sse = false;
-#endif
+template <typename T>
+inline T REF_ADD(const T& a, const T& b) {
+  return a + b;
+}
+template <typename T>
+inline T REF_SUB(const T& a, const T& b) {
+  return a - b;
+}
+template <typename T>
+inline T REF_MUL(const T& a, const T& b) {
+  return a * b;
+}
+template <typename T>
+inline T REF_DIV(const T& a, const T& b) {
+  return a / b;
+}
+template <typename T>
+inline T REF_ABS_DIFF(const T& a, const T& b) {
+  return a > b ? a - b : b - a;
+}
 
-namespace Eigen {
-namespace internal {
-template<typename T> T negate(const T& x) { return -x; }
+// Specializations for bool.
+template <>
+inline bool REF_ADD(const bool& a, const bool& b) {
+  return a || b;
+}
+template <>
+inline bool REF_SUB(const bool& a, const bool& b) {
+  return a ^ b;
 }
+template <>
+inline bool REF_MUL(const bool& a, const bool& b) {
+  return a && b;
 }
 
-// NOTE: we disbale inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU.
-template<typename Scalar> EIGEN_DONT_INLINE
-bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits<Scalar>::Real& refvalue)
-{
-  return internal::isMuchSmallerThan(a-b, refvalue);
+template <typename T>
+inline T REF_FREXP(const T& x, T& exp) {
+  int iexp;
+  EIGEN_USING_STD(frexp)
+  const T out = static_cast<T>(frexp(x, &iexp));
+  exp = static_cast<T>(iexp);
+  return out;
 }
 
-template<typename Scalar> bool areApproxAbs(const Scalar* a, const Scalar* b, int size, const typename NumTraits<Scalar>::Real& refvalue)
-{
-  for (int i=0; i<size; ++i)
-  {
-    if (!isApproxAbs(a[i],b[i],refvalue))
-    {
-      std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
-      return false;
+template <typename T>
+inline T REF_LDEXP(const T& x, const T& exp) {
+  EIGEN_USING_STD(ldexp)
+  return static_cast<T>(ldexp(x, static_cast<int>(exp)));
+}
+
+// Uses pcast to cast from one array to another.
+template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct pcast_array;
+
+template <typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
+struct pcast_array<SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
+  typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
+  static void cast(const SrcScalar* src, size_t size, TgtScalar* dst) {
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    size_t i;
+    for (i = 0; i < size && i + SrcPacketSize <= size; i += TgtPacketSize) {
+      internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(internal::ploadu<SrcPacket>(src + i)));
+    }
+    // Leftovers that cannot be loaded into a packet.
+    for (; i < size; ++i) {
+      dst[i] = static_cast<TgtScalar>(src[i]);
     }
   }
-  return true;
-}
+};
 
-template<typename Scalar> bool areApprox(const Scalar* a, const Scalar* b, int size)
-{
-  for (int i=0; i<size; ++i)
-  {
-    if (a[i]!=b[i] && !internal::isApprox(a[i],b[i]))
-    {
-      std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
-      return false;
+template <typename SrcPacket, typename TgtPacket>
+struct pcast_array<SrcPacket, TgtPacket, 2, 1> {
+  static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
+                   typename internal::unpacket_traits<TgtPacket>::type* dst) {
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    for (size_t i = 0; i < size; i += TgtPacketSize) {
+      SrcPacket a = internal::ploadu<SrcPacket>(src + i);
+      SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
+      internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b));
     }
   }
-  return true;
-}
+};
 
-#define CHECK_CWISE1(REFOP, POP) { \
-  for (int i=0; i<PacketSize; ++i) \
-    ref[i] = REFOP(data1[i]); \
-  internal::pstore(data2, POP(internal::pload<Packet>(data1))); \
-  VERIFY(areApprox(ref, data2, PacketSize) && #POP); \
-}
+template <typename SrcPacket, typename TgtPacket>
+struct pcast_array<SrcPacket, TgtPacket, 4, 1> {
+  static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
+                   typename internal::unpacket_traits<TgtPacket>::type* dst) {
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    for (size_t i = 0; i < size; i += TgtPacketSize) {
+      SrcPacket a = internal::ploadu<SrcPacket>(src + i);
+      SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
+      SrcPacket c = internal::ploadu<SrcPacket>(src + i + 2 * SrcPacketSize);
+      SrcPacket d = internal::ploadu<SrcPacket>(src + i + 3 * SrcPacketSize);
+      internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b, c, d));
+    }
+  }
+};
 
-template<bool Cond,typename Packet>
-struct packet_helper
-{
-  template<typename T>
-  inline Packet load(const T* from) const { return internal::pload<Packet>(from); }
+template <typename SrcPacket, typename TgtPacket>
+struct pcast_array<SrcPacket, TgtPacket, 8, 1> {
+  static void cast(const typename internal::unpacket_traits<SrcPacket>::type* src, size_t size,
+                   typename internal::unpacket_traits<TgtPacket>::type* dst) {
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    for (size_t i = 0; i < size; i += TgtPacketSize) {
+      SrcPacket a = internal::ploadu<SrcPacket>(src + i);
+      SrcPacket b = internal::ploadu<SrcPacket>(src + i + SrcPacketSize);
+      SrcPacket c = internal::ploadu<SrcPacket>(src + i + 2 * SrcPacketSize);
+      SrcPacket d = internal::ploadu<SrcPacket>(src + i + 3 * SrcPacketSize);
+      SrcPacket e = internal::ploadu<SrcPacket>(src + i + 4 * SrcPacketSize);
+      SrcPacket f = internal::ploadu<SrcPacket>(src + i + 5 * SrcPacketSize);
+      SrcPacket g = internal::ploadu<SrcPacket>(src + i + 6 * SrcPacketSize);
+      SrcPacket h = internal::ploadu<SrcPacket>(src + i + 7 * SrcPacketSize);
+      internal::pstoreu(dst + i, internal::pcast<SrcPacket, TgtPacket>(a, b, c, d, e, f, g, h));
+    }
+  }
+};
+
+template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio, bool CanCast = false>
+struct test_cast_helper;
 
-  template<typename T>
-  inline void store(T* to, const Packet& x) const { internal::pstore(to,x); }
+template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, false> {
+  static void run() {}
 };
 
-template<typename Packet>
-struct packet_helper<false,Packet>
-{
-  template<typename T>
-  inline T load(const T* from) const { return *from; }
+template <typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, true> {
+  static void run() {
+    typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
+    typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    static const int BlockSize = SrcPacketSize * SrcCoeffRatio;
+    eigen_assert(BlockSize == TgtPacketSize * TgtCoeffRatio && "Packet sizes and cast ratios are mismatched.");
+
+    static const int DataSize = 10 * BlockSize;
+    EIGEN_ALIGN_MAX SrcScalar data1[DataSize];
+    EIGEN_ALIGN_MAX TgtScalar data2[DataSize];
+    EIGEN_ALIGN_MAX TgtScalar ref[DataSize];
+
+    // Construct a packet of scalars that will not overflow when casting
+    for (int i = 0; i < DataSize; ++i) {
+      data1[i] = internal::random_without_cast_overflow<SrcScalar, TgtScalar>::value();
+    }
+
+    for (int i = 0; i < DataSize; ++i) {
+      ref[i] = static_cast<const TgtScalar>(data1[i]);
+    }
 
-  template<typename T>
-  inline void store(T* to, const T& x) const { *to = x; }
+    pcast_array<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio>::cast(data1, DataSize, data2);
+
+    VERIFY(test::areApprox(ref, data2, DataSize) && "internal::pcast<>");
+  }
 };
 
-#define CHECK_CWISE1_IF(COND, REFOP, POP) if(COND) { \
-  packet_helper<COND,Packet> h; \
-  for (int i=0; i<PacketSize; ++i) \
-    ref[i] = REFOP(data1[i]); \
-  h.store(data2, POP(h.load(data1))); \
-  VERIFY(areApprox(ref, data2, PacketSize) && #POP); \
+template <typename SrcPacket, typename TgtPacket>
+struct test_cast {
+  static void run() {
+    typedef typename internal::unpacket_traits<SrcPacket>::type SrcScalar;
+    typedef typename internal::unpacket_traits<TgtPacket>::type TgtScalar;
+    typedef typename internal::type_casting_traits<SrcScalar, TgtScalar> TypeCastingTraits;
+    static const int SrcCoeffRatio = TypeCastingTraits::SrcCoeffRatio;
+    static const int TgtCoeffRatio = TypeCastingTraits::TgtCoeffRatio;
+    static const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    static const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+    static const bool HasCast =
+        internal::unpacket_traits<SrcPacket>::vectorizable && internal::unpacket_traits<TgtPacket>::vectorizable &&
+        TypeCastingTraits::VectorizedCast && (SrcPacketSize * SrcCoeffRatio == TgtPacketSize * TgtCoeffRatio);
+    test_cast_helper<SrcPacket, TgtPacket, SrcCoeffRatio, TgtCoeffRatio, HasCast>::run();
+  }
+};
+
+template <typename SrcPacket, typename TgtScalar,
+          typename TgtPacket = typename internal::packet_traits<TgtScalar>::type,
+          bool Vectorized = internal::packet_traits<TgtScalar>::Vectorizable,
+          bool HasHalf = !internal::is_same<typename internal::unpacket_traits<TgtPacket>::half, TgtPacket>::value>
+struct test_cast_runner;
+
+template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
+struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, true, false> {
+  static void run() { test_cast<SrcPacket, TgtPacket>::run(); }
+};
+
+template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
+struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, true, true> {
+  static void run() {
+    test_cast<SrcPacket, TgtPacket>::run();
+    test_cast_runner<SrcPacket, TgtScalar, typename internal::unpacket_traits<TgtPacket>::half>::run();
+  }
+};
+
+template <typename SrcPacket, typename TgtScalar, typename TgtPacket>
+struct test_cast_runner<SrcPacket, TgtScalar, TgtPacket, false, false> {
+  static void run() {}
+};
+
+template <typename Scalar, typename Packet, typename EnableIf = void>
+struct packetmath_pcast_ops_runner {
+  static void run() {
+    test_cast_runner<Packet, float>::run();
+    test_cast_runner<Packet, double>::run();
+    test_cast_runner<Packet, int8_t>::run();
+    test_cast_runner<Packet, uint8_t>::run();
+    test_cast_runner<Packet, int16_t>::run();
+    test_cast_runner<Packet, uint16_t>::run();
+    test_cast_runner<Packet, int32_t>::run();
+    test_cast_runner<Packet, uint32_t>::run();
+    test_cast_runner<Packet, int64_t>::run();
+    test_cast_runner<Packet, uint64_t>::run();
+    test_cast_runner<Packet, bool>::run();
+    test_cast_runner<Packet, std::complex<float> >::run();
+    test_cast_runner<Packet, std::complex<double> >::run();
+    test_cast_runner<Packet, half>::run();
+    test_cast_runner<Packet, bfloat16>::run();
+  }
+};
+
+// Only some types support cast from std::complex<>.
+template <typename Scalar, typename Packet>
+struct packetmath_pcast_ops_runner<Scalar, Packet, typename internal::enable_if<NumTraits<Scalar>::IsComplex>::type> {
+  static void run() {
+    test_cast_runner<Packet, std::complex<float> >::run();
+    test_cast_runner<Packet, std::complex<double> >::run();
+    test_cast_runner<Packet, half>::run();
+    test_cast_runner<Packet, bfloat16>::run();
+  }
+};
+
+template <typename Scalar, typename Packet>
+void packetmath_boolean_mask_ops() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = internal::random<Scalar>();
+  }
+  CHECK_CWISE1(internal::ptrue, internal::ptrue);
+  CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(i);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+
+  CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
+
+  //Test (-0) == (0) for signed operations
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
+
+  //Test NaN
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
 }
 
-#define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \
-  packet_helper<COND,Packet> h; \
-  for (int i=0; i<PacketSize; ++i) \
-    ref[i] = REFOP(data1[i], data1[i+PacketSize]); \
-  h.store(data2, POP(h.load(data1),h.load(data1+PacketSize))); \
-  VERIFY(areApprox(ref, data2, PacketSize) && #POP); \
+template <typename Scalar, typename Packet>
+void packetmath_boolean_mask_ops_real() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
+
+  //Test (-0) <=/< (0) for signed operations
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
+
+  //Test NaN
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan);
 }
 
-#define REF_ADD(a,b) ((a)+(b))
-#define REF_SUB(a,b) ((a)-(b))
-#define REF_MUL(a,b) ((a)*(b))
-#define REF_DIV(a,b) ((a)/(b))
+template <typename Scalar, typename Packet>
+void packetmath_boolean_mask_ops_notcomplex() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
 
-template<typename Scalar> void packetmath()
-{
-  using std::abs;
+  //Test (-0) <=/< (0) for signed operations
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
+
+  //Test NaN
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = NumTraits<Scalar>::quiet_NaN();
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le);
+  CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt);
+}
+
+// Packet16b representing bool does not support ptrue, pandnot or pcmp_eq, since the scalar path
+// (for some compilers) compute the bitwise and with 0x1 of the results to keep the value in [0,1].
+template<>
+void packetmath_boolean_mask_ops<bool, internal::packet_traits<bool>::type>() {}
+template<>
+void packetmath_boolean_mask_ops_notcomplex<bool, internal::packet_traits<bool>::type>() {}
+
+template <typename Scalar, typename Packet>
+void packetmath_minus_zero_add() {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+  const int size = 2 * PacketSize;
+  EIGEN_ALIGN_MAX Scalar data1[size];
+  EIGEN_ALIGN_MAX Scalar data2[size];
+  EIGEN_ALIGN_MAX Scalar ref[size];
+
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(-0.0);
+    data1[i + PacketSize] = Scalar(-0.0);
+  }
+  CHECK_CWISE2_IF(internal::packet_traits<Scalar>::HasAdd, REF_ADD, internal::padd);
+}
+
+// Ensure optimization barrier compiles and doesn't modify contents.
+// Only applies to raw types, so will not work for std::complex, Eigen::half
+// or Eigen::bfloat16. For those you would need to refer to an underlying
+// storage element.
+template<typename Packet, typename EnableIf = void>
+struct eigen_optimization_barrier_test {
+  static void run() {}
+};
+
+template<typename Packet>
+struct eigen_optimization_barrier_test<Packet, typename internal::enable_if<
+    !NumTraits<Packet>::IsComplex &&
+    !internal::is_same<Packet, Eigen::half>::value &&
+    !internal::is_same<Packet, Eigen::bfloat16>::value
+  >::type> {
+  static void run() {
+    typedef typename internal::unpacket_traits<Packet>::type Scalar;
+    Scalar s = internal::random<Scalar>();
+    Packet barrier = internal::pset1<Packet>(s);
+    EIGEN_OPTIMIZATION_BARRIER(barrier);
+    eigen_assert(s == internal::pfirst(barrier) && "EIGEN_OPTIMIZATION_BARRIER");
+  }
+};
+
+template <typename Scalar, typename Packet>
+void packetmath() {
   typedef internal::packet_traits<Scalar> PacketTraits;
-  typedef typename PacketTraits::type Packet;
-  const int PacketSize = PacketTraits::size;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
+  if (g_first_pass)
+    std::cerr << "=== Testing packet of type '" << typeid(Packet).name() << "' and scalar type '"
+              << typeid(Scalar).name() << "' and size '" << PacketSize << "' ===\n";
+
   const int max_size = PacketSize > 4 ? PacketSize : 4;
-  const int size = PacketSize*max_size;
+  const int size = PacketSize * max_size;
   EIGEN_ALIGN_MAX Scalar data1[size];
   EIGEN_ALIGN_MAX Scalar data2[size];
-  EIGEN_ALIGN_MAX Packet packets[PacketSize*2];
+  EIGEN_ALIGN_MAX Scalar data3[size];
   EIGEN_ALIGN_MAX Scalar ref[size];
-  RealScalar refvalue = 0;
-  for (int i=0; i<size; ++i)
-  {
-    data1[i] = internal::random<Scalar>()/RealScalar(PacketSize);
-    data2[i] = internal::random<Scalar>()/RealScalar(PacketSize);
-    refvalue = (std::max)(refvalue,abs(data1[i]));
+  RealScalar refvalue = RealScalar(0);
+
+  eigen_optimization_barrier_test<Packet>::run();
+  eigen_optimization_barrier_test<Scalar>::run();
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    data2[i] = internal::random<Scalar>() / RealScalar(PacketSize);
+    refvalue = (std::max)(refvalue, numext::abs(data1[i]));
   }
 
   internal::pstore(data2, internal::pload<Packet>(data1));
-  VERIFY(areApprox(data1, data2, PacketSize) && "aligned load/store");
+  VERIFY(test::areApprox(data1, data2, PacketSize) && "aligned load/store");
 
-  for (int offset=0; offset<PacketSize; ++offset)
-  {
-    internal::pstore(data2, internal::ploadu<Packet>(data1+offset));
-    VERIFY(areApprox(data1+offset, data2, PacketSize) && "internal::ploadu");
+  for (int offset = 0; offset < PacketSize; ++offset) {
+    internal::pstore(data2, internal::ploadu<Packet>(data1 + offset));
+    VERIFY(test::areApprox(data1 + offset, data2, PacketSize) && "internal::ploadu");
   }
 
-  for (int offset=0; offset<PacketSize; ++offset)
-  {
-    internal::pstoreu(data2+offset, internal::pload<Packet>(data1));
-    VERIFY(areApprox(data1, data2+offset, PacketSize) && "internal::pstoreu");
+  for (int offset = 0; offset < PacketSize; ++offset) {
+    internal::pstoreu(data2 + offset, internal::pload<Packet>(data1));
+    VERIFY(test::areApprox(data1, data2 + offset, PacketSize) && "internal::pstoreu");
   }
 
-  for (int offset=0; offset<PacketSize; ++offset)
-  {
-    packets[0] = internal::pload<Packet>(data1);
-    packets[1] = internal::pload<Packet>(data1+PacketSize);
-         if (offset==0) internal::palign<0>(packets[0], packets[1]);
-    else if (offset==1) internal::palign<1>(packets[0], packets[1]);
-    else if (offset==2) internal::palign<2>(packets[0], packets[1]);
-    else if (offset==3) internal::palign<3>(packets[0], packets[1]);
-    else if (offset==4) internal::palign<4>(packets[0], packets[1]);
-    else if (offset==5) internal::palign<5>(packets[0], packets[1]);
-    else if (offset==6) internal::palign<6>(packets[0], packets[1]);
-    else if (offset==7) internal::palign<7>(packets[0], packets[1]);
-    else if (offset==8) internal::palign<8>(packets[0], packets[1]);
-    else if (offset==9) internal::palign<9>(packets[0], packets[1]);
-    else if (offset==10) internal::palign<10>(packets[0], packets[1]);
-    else if (offset==11) internal::palign<11>(packets[0], packets[1]);
-    else if (offset==12) internal::palign<12>(packets[0], packets[1]);
-    else if (offset==13) internal::palign<13>(packets[0], packets[1]);
-    else if (offset==14) internal::palign<14>(packets[0], packets[1]);
-    else if (offset==15) internal::palign<15>(packets[0], packets[1]);
-    internal::pstore(data2, packets[0]);
-
-    for (int i=0; i<PacketSize; ++i)
-      ref[i] = data1[i+offset];
-
-    VERIFY(areApprox(ref, data2, PacketSize) && "internal::palign");
+  if (internal::unpacket_traits<Packet>::masked_load_available) {
+    test::packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h;
+    unsigned long long max_umask = (0x1ull << PacketSize);
+
+    for (int offset = 0; offset < PacketSize; ++offset) {
+      for (unsigned long long umask = 0; umask < max_umask; ++umask) {
+        h.store(data2, h.load(data1 + offset, umask));
+        for (int k = 0; k < PacketSize; ++k) data3[k] = ((umask & (0x1ull << k)) >> k) ? data1[k + offset] : Scalar(0);
+        VERIFY(test::areApprox(data3, data2, PacketSize) && "internal::ploadu masked");
+      }
+    }
+  }
+
+  if (internal::unpacket_traits<Packet>::masked_store_available) {
+    test::packet_helper<internal::unpacket_traits<Packet>::masked_store_available, Packet> h;
+    unsigned long long max_umask = (0x1ull << PacketSize);
+
+    for (int offset = 0; offset < PacketSize; ++offset) {
+      for (unsigned long long umask = 0; umask < max_umask; ++umask) {
+        internal::pstore(data2, internal::pset1<Packet>(Scalar(0)));
+        h.store(data2, h.loadu(data1 + offset), umask);
+        for (int k = 0; k < PacketSize; ++k) data3[k] = ((umask & (0x1ull << k)) >> k) ? data1[k + offset] : Scalar(0);
+        VERIFY(test::areApprox(data3, data2, PacketSize) && "internal::pstoreu masked");
+      }
+    }
   }
 
   VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd);
   VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub);
   VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul);
-  VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasNegate);
-  VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv);
 
-  CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD,  internal::padd);
-  CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB,  internal::psub);
-  CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL,  internal::pmul);
+  CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd);
+  CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub);
+  CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul);
   CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
 
-  CHECK_CWISE1(internal::negate, internal::pnegate);
+  if (PacketTraits::HasNegate) CHECK_CWISE1(internal::negate, internal::pnegate);
   CHECK_CWISE1(numext::conj, internal::pconj);
 
-  for(int offset=0;offset<3;++offset)
-  {
-    for (int i=0; i<PacketSize; ++i)
-      ref[i] = data1[offset];
+  for (int offset = 0; offset < 3; ++offset) {
+    for (int i = 0; i < PacketSize; ++i) ref[i] = data1[offset];
     internal::pstore(data2, internal::pset1<Packet>(data1[offset]));
-    VERIFY(areApprox(ref, data2, PacketSize) && "internal::pset1");
+    VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::pset1");
   }
 
   {
-    for (int i=0; i<PacketSize*4; ++i)
-      ref[i] = data1[i/PacketSize];
+    for (int i = 0; i < PacketSize * 4; ++i) ref[i] = data1[i / PacketSize];
     Packet A0, A1, A2, A3;
     internal::pbroadcast4<Packet>(data1, A0, A1, A2, A3);
-    internal::pstore(data2+0*PacketSize, A0);
-    internal::pstore(data2+1*PacketSize, A1);
-    internal::pstore(data2+2*PacketSize, A2);
-    internal::pstore(data2+3*PacketSize, A3);
-    VERIFY(areApprox(ref, data2, 4*PacketSize) && "internal::pbroadcast4");
+    internal::pstore(data2 + 0 * PacketSize, A0);
+    internal::pstore(data2 + 1 * PacketSize, A1);
+    internal::pstore(data2 + 2 * PacketSize, A2);
+    internal::pstore(data2 + 3 * PacketSize, A3);
+    VERIFY(test::areApprox(ref, data2, 4 * PacketSize) && "internal::pbroadcast4");
   }
 
   {
-    for (int i=0; i<PacketSize*2; ++i)
-      ref[i] = data1[i/PacketSize];
+    for (int i = 0; i < PacketSize * 2; ++i) ref[i] = data1[i / PacketSize];
     Packet A0, A1;
     internal::pbroadcast2<Packet>(data1, A0, A1);
-    internal::pstore(data2+0*PacketSize, A0);
-    internal::pstore(data2+1*PacketSize, A1);
-    VERIFY(areApprox(ref, data2, 2*PacketSize) && "internal::pbroadcast2");
+    internal::pstore(data2 + 0 * PacketSize, A0);
+    internal::pstore(data2 + 1 * PacketSize, A1);
+    VERIFY(test::areApprox(ref, data2, 2 * PacketSize) && "internal::pbroadcast2");
   }
 
   VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload<Packet>(data1))) && "internal::pfirst");
 
-  if(PacketSize>1)
-  {
-    for(int offset=0;offset<4;++offset)
-    {
-      for(int i=0;i<PacketSize/2;++i)
-        ref[2*i+0] = ref[2*i+1] = data1[offset+i];
-      internal::pstore(data2,internal::ploaddup<Packet>(data1+offset));
-      VERIFY(areApprox(ref, data2, PacketSize) && "ploaddup");
+  if (PacketSize > 1) {
+    // apply different offsets to check that ploaddup is robust to unaligned inputs
+    for (int offset = 0; offset < 4; ++offset) {
+      for (int i = 0; i < PacketSize / 2; ++i) ref[2 * i + 0] = ref[2 * i + 1] = data1[offset + i];
+      internal::pstore(data2, internal::ploaddup<Packet>(data1 + offset));
+      VERIFY(test::areApprox(ref, data2, PacketSize) && "ploaddup");
     }
   }
 
-  if(PacketSize>2)
-  {
-    for(int offset=0;offset<4;++offset)
-    {
-      for(int i=0;i<PacketSize/4;++i)
-        ref[4*i+0] = ref[4*i+1] = ref[4*i+2] = ref[4*i+3] = data1[offset+i];
-      internal::pstore(data2,internal::ploadquad<Packet>(data1+offset));
-      VERIFY(areApprox(ref, data2, PacketSize) && "ploadquad");
+  if (PacketSize > 2) {
+    // apply different offsets to check that ploadquad is robust to unaligned inputs
+    for (int offset = 0; offset < 4; ++offset) {
+      for (int i = 0; i < PacketSize / 4; ++i)
+        ref[4 * i + 0] = ref[4 * i + 1] = ref[4 * i + 2] = ref[4 * i + 3] = data1[offset + i];
+      internal::pstore(data2, internal::ploadquad<Packet>(data1 + offset));
+      VERIFY(test::areApprox(ref, data2, PacketSize) && "ploadquad");
     }
   }
 
-  ref[0] = 0;
-  for (int i=0; i<PacketSize; ++i)
-    ref[0] += data1[i];
-  VERIFY(isApproxAbs(ref[0], internal::predux(internal::pload<Packet>(data1)), refvalue) && "internal::predux");
-
-  {
-    int newsize = PacketSize>4?PacketSize/2:PacketSize;
-    for (int i=0; i<newsize; ++i)
-      ref[i] = 0;
-    for (int i=0; i<PacketSize; ++i)
-      ref[i%newsize] += data1[i];
-    internal::pstore(data2, internal::predux_downto4(internal::pload<Packet>(data1)));
-    VERIFY(areApprox(ref, data2, newsize) && "internal::predux_downto4");
-  }
-
-  ref[0] = 1;
-  for (int i=0; i<PacketSize; ++i)
-    ref[0] *= data1[i];
-  VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");
+  ref[0] = Scalar(0);
+  for (int i = 0; i < PacketSize; ++i) ref[0] += data1[i];
+  VERIFY(test::isApproxAbs(ref[0], internal::predux(internal::pload<Packet>(data1)), refvalue) && "internal::predux");
 
-  for (int j=0; j<PacketSize; ++j)
-  {
-    ref[j] = 0;
-    for (int i=0; i<PacketSize; ++i)
-      ref[j] += data1[i+j*PacketSize];
-    packets[j] = internal::pload<Packet>(data1+j*PacketSize);
+  if (!internal::is_same<Packet, typename internal::unpacket_traits<Packet>::half>::value) {
+    int HalfPacketSize = PacketSize > 4 ? PacketSize / 2 : PacketSize;
+    for (int i = 0; i < HalfPacketSize; ++i) ref[i] = Scalar(0);
+    for (int i = 0; i < PacketSize; ++i) ref[i % HalfPacketSize] += data1[i];
+    internal::pstore(data2, internal::predux_half_dowto4(internal::pload<Packet>(data1)));
+    VERIFY(test::areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4");
   }
-  internal::pstore(data2, internal::preduxp(packets));
-  VERIFY(areApproxAbs(ref, data2, PacketSize, refvalue) && "internal::preduxp");
 
-  for (int i=0; i<PacketSize; ++i)
-    ref[i] = data1[PacketSize-i-1];
+  ref[0] = Scalar(1);
+  for (int i = 0; i < PacketSize; ++i) ref[0] = REF_MUL(ref[0], data1[i]);
+  VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");
+
+  for (int i = 0; i < PacketSize; ++i) ref[i] = data1[PacketSize - i - 1];
   internal::pstore(data2, internal::preverse(internal::pload<Packet>(data1)));
-  VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse");
+  VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::preverse");
 
   internal::PacketBlock<Packet> kernel;
-  for (int i=0; i<PacketSize; ++i) {
-    kernel.packet[i] = internal::pload<Packet>(data1+i*PacketSize);
+  for (int i = 0; i < PacketSize; ++i) {
+    kernel.packet[i] = internal::pload<Packet>(data1 + i * PacketSize);
   }
   ptranspose(kernel);
-  for (int i=0; i<PacketSize; ++i) {
+  for (int i = 0; i < PacketSize; ++i) {
     internal::pstore(data2, kernel.packet[i]);
     for (int j = 0; j < PacketSize; ++j) {
-      VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose");
+      VERIFY(test::isApproxAbs(data2[j], data1[i + j * PacketSize], refvalue) && "ptranspose");
+    }
+  }
+
+  // GeneralBlockPanelKernel also checks PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize>;
+  if (PacketSize > 4 && PacketSize % 4 == 0) {
+    internal::PacketBlock<Packet, PacketSize%4==0?4:PacketSize> kernel2;
+    for (int i = 0; i < 4; ++i) {
+      kernel2.packet[i] = internal::pload<Packet>(data1 + i * PacketSize);
+    }
+    ptranspose(kernel2);
+    int data_counter = 0;
+    for (int i = 0; i < PacketSize; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        data2[data_counter++] = data1[j*PacketSize + i];
+      }
+    }
+    for (int i = 0; i < 4; ++i) {
+      internal::pstore(data3, kernel2.packet[i]);
+      for (int j = 0; j < PacketSize; ++j) {
+        VERIFY(test::isApproxAbs(data3[j], data2[i*PacketSize + j], refvalue) && "ptranspose");
+      }
     }
   }
 
@@ -301,342 +579,724 @@ template<typename Scalar> void packetmath()
     EIGEN_ALIGN_MAX Scalar result[size];
     internal::pstore(result, blend);
     for (int i = 0; i < PacketSize; ++i) {
-      VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue));
+      VERIFY(test::isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue));
     }
   }
 
-  if (PacketTraits::HasBlend) {
-    // pinsertfirst
-    for (int i=0; i<PacketSize; ++i)
-      ref[i] = data1[i];
-    Scalar s = internal::random<Scalar>();
-    ref[0] = s;
-    internal::pstore(data2, internal::pinsertfirst(internal::pload<Packet>(data1),s));
-    VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertfirst");
+  {
+    for (int i = 0; i < PacketSize; ++i) {
+      // "if" mask
+      unsigned char v = internal::random<bool>() ? 0xff : 0;
+      char* bytes = (char*)(data1 + i);
+      for (int k = 0; k < int(sizeof(Scalar)); ++k) {
+        bytes[k] = v;
+      }
+      // "then" packet
+      data1[i + PacketSize] = internal::random<Scalar>();
+      // "else" packet
+      data1[i + 2 * PacketSize] = internal::random<Scalar>();
+    }
+    CHECK_CWISE3_IF(true, internal::pselect, internal::pselect);
   }
 
-  if (PacketTraits::HasBlend) {
-    // pinsertlast
-    for (int i=0; i<PacketSize; ++i)
-      ref[i] = data1[i];
-    Scalar s = internal::random<Scalar>();
-    ref[PacketSize-1] = s;
-    internal::pstore(data2, internal::pinsertlast(internal::pload<Packet>(data1),s));
-    VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertlast");
+  for (int i = 0; i < size; ++i) {
+    data1[i] = internal::random<Scalar>();
+  }
+  CHECK_CWISE1(internal::pzero, internal::pzero);
+  CHECK_CWISE2_IF(true, internal::por, internal::por);
+  CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
+  CHECK_CWISE2_IF(true, internal::pand, internal::pand);
+
+  packetmath_boolean_mask_ops<Scalar, Packet>();
+  packetmath_pcast_ops_runner<Scalar, Packet>::run();
+  packetmath_minus_zero_add<Scalar, Packet>();
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = numext::abs(internal::random<Scalar>());
   }
+  CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
+  CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
+}
+
+// Notice that this definition works for complex types as well.
+// c++11 has std::log2 for real, but not for complex types.
+template <typename Scalar>
+Scalar log2(Scalar x) {
+  return Scalar(EIGEN_LOG2E) * std::log(x);
 }
 
-template<typename Scalar> void packetmath_real()
-{
-  using std::abs;
+template <typename Scalar, typename Packet>
+void packetmath_real() {
   typedef internal::packet_traits<Scalar> PacketTraits;
-  typedef typename PacketTraits::type Packet;
-  const int PacketSize = PacketTraits::size;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
 
-  const int size = PacketSize*4;
-  EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4];
-  EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4];
-  EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4];
+  const int size = PacketSize * 4;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
 
-  for (int i=0; i<size; ++i)
-  {
-    data1[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-3,3));
-    data2[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-3,3));
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(0, 1) * std::pow(10., internal::random<double>(-6, 6)));
+    data2[i] = Scalar(internal::random<double>(0, 1) * std::pow(10., internal::random<double>(-6, 6)));
+  }
+
+  if (internal::random<float>(0, 1) < 0.1f) data1[internal::random<int>(0, PacketSize)] = Scalar(0);
+
+  CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
+  CHECK_CWISE1_IF(PacketTraits::HasLog, log2, internal::plog2);
+  CHECK_CWISE1_IF(PacketTraits::HasRsqrt, numext::rsqrt, internal::prsqrt);
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-3, 3)));
+    data2[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-3, 3)));
   }
   CHECK_CWISE1_IF(PacketTraits::HasSin, std::sin, internal::psin);
   CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos);
   CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan);
 
-  CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
-  CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
-  CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
+
+  packetmath_boolean_mask_ops_real<Scalar,Packet>();
+  
+  // Rounding edge cases.
+  if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) {
+    typedef typename internal::make_integer<Scalar>::type IntType;
+    // Start with values that cannot fit inside an integer, work down to less than one.
+    Scalar val = numext::mini(
+        Scalar(2) * static_cast<Scalar>(NumTraits<IntType>::highest()),
+        NumTraits<Scalar>::highest());
+    std::vector<Scalar> values;
+    while (val > Scalar(0.25)) {
+      // Cover both even and odd, positive and negative cases.
+      values.push_back(val);
+      values.push_back(val + Scalar(0.3));
+      values.push_back(val + Scalar(0.5));
+      values.push_back(val + Scalar(0.8));
+      values.push_back(val + Scalar(1));
+      values.push_back(val + Scalar(1.3));
+      values.push_back(val + Scalar(1.5));
+      values.push_back(val + Scalar(1.8));
+      values.push_back(-val);
+      values.push_back(-val - Scalar(0.3));
+      values.push_back(-val - Scalar(0.5));
+      values.push_back(-val - Scalar(0.8));
+      values.push_back(-val - Scalar(1));
+      values.push_back(-val - Scalar(1.3));
+      values.push_back(-val - Scalar(1.5));
+      values.push_back(-val - Scalar(1.8));
+      values.push_back(Scalar(-1.5) + val);  // Bug 1785.
+      val = val / Scalar(2);
+    }
+    values.push_back(NumTraits<Scalar>::infinity());
+    values.push_back(-NumTraits<Scalar>::infinity());
+    values.push_back(NumTraits<Scalar>::quiet_NaN());
+    
+    for (size_t k=0; k<values.size(); ++k) {
+      data1[0] = values[k];
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
+    }
+  }
 
-  for (int i=0; i<size; ++i)
-  {
-    data1[i] = internal::random<Scalar>(-1,1);
-    data2[i] = internal::random<Scalar>(-1,1);
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1));
+    data2[i] = Scalar(internal::random<double>(-1, 1));
   }
   CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin);
   CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos);
 
-  for (int i=0; i<size; ++i)
-  {
-    data1[i] = internal::random<Scalar>(-87,88);
-    data2[i] = internal::random<Scalar>(-87,88);
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(-87, 88));
+    data2[i] = Scalar(internal::random<double>(-87, 88));
   }
   CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp);
-  for (int i=0; i<size; ++i)
-  {
-    data1[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
-    data2[i] = internal::random<Scalar>(-1,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
+  
+  CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+  if (PacketTraits::HasExp) {
+    // Check denormals:
+    for (int j=0; j<3; ++j) {
+      data1[0] = Scalar(std::ldexp(1, NumTraits<Scalar>::min_exponent()-j));
+      CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+      data1[0] = -data1[0];
+      CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+    }
+    
+    // zero
+    data1[0] = Scalar(0);
+    CHECK_CWISE1_BYREF1_IF(PacketTraits::HasExp, REF_FREXP, internal::pfrexp);
+    
+    // inf and NaN only compare output fraction, not exponent.
+    test::packet_helper<PacketTraits::HasExp,Packet> h;
+    Packet pout;
+    Scalar sout;
+    Scalar special[] = { NumTraits<Scalar>::infinity(), 
+                        -NumTraits<Scalar>::infinity(),
+                         NumTraits<Scalar>::quiet_NaN()};
+    for (int i=0; i<3; ++i) {
+      data1[0] = special[i];
+      ref[0] = Scalar(REF_FREXP(data1[0], ref[PacketSize]));
+      h.store(data2, internal::pfrexp(h.load(data1), h.forward_reference(pout, sout)));
+      VERIFY(test::areApprox(ref, data2, 1) && "internal::pfrexp");
+    }
+  }
+  
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1));
+    data2[i] = Scalar(internal::random<double>(-1, 1));
   }
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i+PacketSize] = Scalar(internal::random<int>(-4, 4));
+    data2[i+PacketSize] = Scalar(internal::random<double>(-4, 4));
+  }
+  CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+  if (PacketTraits::HasExp) {
+    data1[0] = Scalar(-1);
+    // underflow to zero
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::min_exponent()-55);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // overflow to inf
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::max_exponent()+10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // NaN stays NaN
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    VERIFY((numext::isnan)(data2[0]));
+    // inf stays inf
+    data1[0] = NumTraits<Scalar>::infinity();
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::min_exponent()-10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // zero stays zero
+    data1[0] = Scalar(0);
+    data1[PacketSize] = Scalar(NumTraits<Scalar>::max_exponent()+10);
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // Small number big exponent.
+    data1[0] = Scalar(std::ldexp(Scalar(1.0), NumTraits<Scalar>::min_exponent()-1));
+    data1[PacketSize] = Scalar(-NumTraits<Scalar>::min_exponent()
+                               +NumTraits<Scalar>::max_exponent());
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+    // Big number small exponent.
+    data1[0] = Scalar(std::ldexp(Scalar(1.0), NumTraits<Scalar>::max_exponent()-1));
+    data1[PacketSize] = Scalar(+NumTraits<Scalar>::min_exponent()
+                               -NumTraits<Scalar>::max_exponent());
+    CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
+  }
+
+  for (int i = 0; i < size; ++i) {
+    data1[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-6, 6)));
+    data2[i] = Scalar(internal::random<double>(-1, 1) * std::pow(10., internal::random<double>(-6, 6)));
+  }
+  data1[0] = Scalar(1e-20);
   CHECK_CWISE1_IF(PacketTraits::HasTanh, std::tanh, internal::ptanh);
-  if(PacketTraits::HasExp && PacketTraits::size>=2)
-  {
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    data1[1] = std::numeric_limits<Scalar>::epsilon();
-    packet_helper<PacketTraits::HasExp,Packet> h;
+  if (PacketTraits::HasExp && PacketSize >= 2) {
+    const Scalar small = NumTraits<Scalar>::epsilon();
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    data1[1] = small;
+    test::packet_helper<PacketTraits::HasExp, Packet> h;
     h.store(data2, internal::pexp(h.load(data1)));
     VERIFY((numext::isnan)(data2[0]));
-    VERIFY_IS_EQUAL(std::exp(std::numeric_limits<Scalar>::epsilon()), data2[1]);
+    // TODO(rmlarsen): Re-enable for bfloat16.
+    if (!internal::is_same<Scalar, bfloat16>::value) {
+      VERIFY_IS_APPROX(std::exp(small), data2[1]);
+    }
 
-    data1[0] = -std::numeric_limits<Scalar>::epsilon();
-    data1[1] = 0;
+    data1[0] = -small;
+    data1[1] = Scalar(0);
     h.store(data2, internal::pexp(h.load(data1)));
-    VERIFY_IS_EQUAL(std::exp(-std::numeric_limits<Scalar>::epsilon()), data2[0]);
+    // TODO(rmlarsen): Re-enable for bfloat16.
+    if (!internal::is_same<Scalar, bfloat16>::value) {
+      VERIFY_IS_APPROX(std::exp(-small), data2[0]);
+    }
     VERIFY_IS_EQUAL(std::exp(Scalar(0)), data2[1]);
 
     data1[0] = (std::numeric_limits<Scalar>::min)();
     data1[1] = -(std::numeric_limits<Scalar>::min)();
     h.store(data2, internal::pexp(h.load(data1)));
-    VERIFY_IS_EQUAL(std::exp((std::numeric_limits<Scalar>::min)()), data2[0]);
-    VERIFY_IS_EQUAL(std::exp(-(std::numeric_limits<Scalar>::min)()), data2[1]);
+    VERIFY_IS_APPROX(std::exp((std::numeric_limits<Scalar>::min)()), data2[0]);
+    VERIFY_IS_APPROX(std::exp(-(std::numeric_limits<Scalar>::min)()), data2[1]);
 
     data1[0] = std::numeric_limits<Scalar>::denorm_min();
     data1[1] = -std::numeric_limits<Scalar>::denorm_min();
     h.store(data2, internal::pexp(h.load(data1)));
-    VERIFY_IS_EQUAL(std::exp(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
-    VERIFY_IS_EQUAL(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
+    VERIFY_IS_APPROX(std::exp(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+    VERIFY_IS_APPROX(std::exp(-std::numeric_limits<Scalar>::denorm_min()), data2[1]);
   }
 
   if (PacketTraits::HasTanh) {
     // NOTE this test migh fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasTanh, Packet> h;
     h.store(data2, internal::ptanh(h.load(data1)));
     VERIFY((numext::isnan)(data2[0]));
   }
 
-#if EIGEN_HAS_C99_MATH
-  {
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h;
-    h.store(data2, internal::plgamma(h.load(data1)));
-    VERIFY((numext::isnan)(data2[0]));
-  }
-  {
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    packet_helper<internal::packet_traits<Scalar>::HasErf,Packet> h;
-    h.store(data2, internal::perf(h.load(data1)));
-    VERIFY((numext::isnan)(data2[0]));
-  }
-  {
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    packet_helper<internal::packet_traits<Scalar>::HasErfc,Packet> h;
-    h.store(data2, internal::perfc(h.load(data1)));
-    VERIFY((numext::isnan)(data2[0]));
-  }
-#endif  // EIGEN_HAS_C99_MATH
+  if (PacketTraits::HasExp) {
+    internal::scalar_logistic_op<Scalar> logistic;
+    for (int i = 0; i < size; ++i) {
+      data1[i] = Scalar(internal::random<double>(-20, 20));
+    }
 
-  for (int i=0; i<size; ++i)
-  {
-    data1[i] = internal::random<Scalar>(0,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
-    data2[i] = internal::random<Scalar>(0,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
+    test::packet_helper<PacketTraits::HasExp, Packet> h;
+    h.store(data2, logistic.packetOp(h.load(data1)));
+    for (int i = 0; i < PacketSize; ++i) {
+      VERIFY_IS_APPROX(data2[i], logistic(data1[i]));
+    }
   }
 
-  if(internal::random<float>(0,1)<0.1f)
-    data1[internal::random<int>(0, PacketSize)] = 0;
-  CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt);
-  CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
-#if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
+#if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11)
+  data1[0] = NumTraits<Scalar>::infinity();
+  data1[1] = Scalar(-1);
   CHECK_CWISE1_IF(PacketTraits::HasLog1p, std::log1p, internal::plog1p);
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
-  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
+  data1[0] = NumTraits<Scalar>::infinity();
+  data1[1] = -NumTraits<Scalar>::infinity();
+  CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1);
 #endif
 
-  if(PacketTraits::HasLog && PacketTraits::size>=2)
-  {
-    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
-    data1[1] = std::numeric_limits<Scalar>::epsilon();
-    packet_helper<PacketTraits::HasLog,Packet> h;
-    h.store(data2, internal::plog(h.load(data1)));
-    VERIFY((numext::isnan)(data2[0]));
-    VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::epsilon()), data2[1]);
+  if (PacketSize >= 2) {
+    data1[0] = NumTraits<Scalar>::quiet_NaN();
+    data1[1] = NumTraits<Scalar>::epsilon();
+    if (PacketTraits::HasLog) {
+      test::packet_helper<PacketTraits::HasLog, Packet> h;
+      h.store(data2, internal::plog(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      // TODO(cantonios): Re-enable for bfloat16.
+      if (!internal::is_same<Scalar, bfloat16>::value) {
+        VERIFY_IS_APPROX(std::log(data1[1]), data2[1]);
+      }
+
+      data1[0] = -NumTraits<Scalar>::epsilon();
+      data1[1] = Scalar(0);
+      h.store(data2, internal::plog(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]);
+
+      data1[0] = (std::numeric_limits<Scalar>::min)();
+      data1[1] = -(std::numeric_limits<Scalar>::min)();
+      h.store(data2, internal::plog(h.load(data1)));
+      // TODO(cantonios): Re-enable for bfloat16.
+      if (!internal::is_same<Scalar, bfloat16>::value) {
+        VERIFY_IS_APPROX(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
+      }
+      VERIFY((numext::isnan)(data2[1]));
+
+      // Note: 32-bit arm always flushes denorms to zero.
+#if !EIGEN_ARCH_ARM
+      if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
+        data1[0] = std::numeric_limits<Scalar>::denorm_min();
+        data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+        h.store(data2, internal::plog(h.load(data1)));
+        // TODO(rmlarsen): Reenable.
+        //        VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+        VERIFY((numext::isnan)(data2[1]));
+      }
+#endif
 
-    data1[0] = -std::numeric_limits<Scalar>::epsilon();
-    data1[1] = 0;
-    h.store(data2, internal::plog(h.load(data1)));
-    VERIFY((numext::isnan)(data2[0]));
-    VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]);
+      data1[0] = Scalar(-1.0f);
+      h.store(data2, internal::plog(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
 
-    data1[0] = (std::numeric_limits<Scalar>::min)();
-    data1[1] = -(std::numeric_limits<Scalar>::min)();
-    h.store(data2, internal::plog(h.load(data1)));
-    VERIFY_IS_EQUAL(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
-    VERIFY((numext::isnan)(data2[1]));
+      data1[0] = NumTraits<Scalar>::infinity();
+      h.store(data2, internal::plog(h.load(data1)));
+      VERIFY((numext::isinf)(data2[0]));
+    }
+    if (PacketTraits::HasLog1p) {
+      test::packet_helper<PacketTraits::HasLog1p, Packet> h;
+      data1[0] = Scalar(-2);
+      data1[1] = -NumTraits<Scalar>::infinity();
+      h.store(data2, internal::plog1p(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY((numext::isnan)(data2[1]));
+    }
+    if (PacketTraits::HasSqrt) {
+      test::packet_helper<PacketTraits::HasSqrt, Packet> h;
+      data1[0] = Scalar(-1.0f);
+      if (std::numeric_limits<Scalar>::has_denorm == std::denorm_present) {
+        data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+      } else {
+        data1[1] = -NumTraits<Scalar>::epsilon();
+      }
+      h.store(data2, internal::psqrt(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY((numext::isnan)(data2[1]));
+    }
+    // TODO(rmlarsen): Re-enable for half and bfloat16.
+    if (PacketTraits::HasCos
+        && !internal::is_same<Scalar, half>::value
+        && !internal::is_same<Scalar, bfloat16>::value) {
+      test::packet_helper<PacketTraits::HasCos, Packet> h;
+      for (Scalar k = Scalar(1); k < Scalar(10000) / NumTraits<Scalar>::epsilon(); k *= Scalar(2)) {
+        for (int k1 = 0; k1 <= 1; ++k1) {
+          data1[0] = Scalar((2 * double(k) + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
+          data1[1] = Scalar((2 * double(k) + 2 + k1) * double(EIGEN_PI) / 2 * internal::random<double>(0.8, 1.2));
+          h.store(data2, internal::pcos(h.load(data1)));
+          h.store(data2 + PacketSize, internal::psin(h.load(data1)));
+          VERIFY(data2[0] <= Scalar(1.) && data2[0] >= Scalar(-1.));
+          VERIFY(data2[1] <= Scalar(1.) && data2[1] >= Scalar(-1.));
+          VERIFY(data2[PacketSize + 0] <= Scalar(1.) && data2[PacketSize + 0] >= Scalar(-1.));
+          VERIFY(data2[PacketSize + 1] <= Scalar(1.) && data2[PacketSize + 1] >= Scalar(-1.));
+
+          VERIFY_IS_APPROX(data2[0], std::cos(data1[0]));
+          VERIFY_IS_APPROX(data2[1], std::cos(data1[1]));
+          VERIFY_IS_APPROX(data2[PacketSize + 0], std::sin(data1[0]));
+          VERIFY_IS_APPROX(data2[PacketSize + 1], std::sin(data1[1]));
+
+          VERIFY_IS_APPROX(numext::abs2(data2[0]) + numext::abs2(data2[PacketSize + 0]), Scalar(1));
+          VERIFY_IS_APPROX(numext::abs2(data2[1]) + numext::abs2(data2[PacketSize + 1]), Scalar(1));
+        }
+      }
+
+      data1[0] = NumTraits<Scalar>::infinity();
+      data1[1] = -NumTraits<Scalar>::infinity();
+      h.store(data2, internal::psin(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY((numext::isnan)(data2[1]));
+
+      h.store(data2, internal::pcos(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      VERIFY((numext::isnan)(data2[1]));
+
+      data1[0] = NumTraits<Scalar>::quiet_NaN();
+      h.store(data2, internal::psin(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+      h.store(data2, internal::pcos(h.load(data1)));
+      VERIFY((numext::isnan)(data2[0]));
+
+      data1[0] = -Scalar(0.);
+      h.store(data2, internal::psin(h.load(data1)));
+      VERIFY(internal::biteq(data2[0], data1[0]));
+      h.store(data2, internal::pcos(h.load(data1)));
+      VERIFY_IS_EQUAL(data2[0], Scalar(1));
+    }
+  }
+}
 
-    data1[0] = std::numeric_limits<Scalar>::denorm_min();
-    data1[1] = -std::numeric_limits<Scalar>::denorm_min();
-    h.store(data2, internal::plog(h.load(data1)));
-    // VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
-    VERIFY((numext::isnan)(data2[1]));
+#define CAST_CHECK_CWISE1_IF(COND, REFOP, POP, SCALAR, REFTYPE) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = SCALAR(REFOP(static_cast<REFTYPE>(data1[i]))); \
+  h.store(data2, POP(h.load(data1))); \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
+}
 
-    data1[0] = Scalar(-1.0f);
-    h.store(data2, internal::plog(h.load(data1)));
-    VERIFY((numext::isnan)(data2[0]));
-    h.store(data2, internal::psqrt(h.load(data1)));
-    VERIFY((numext::isnan)(data2[0]));
-    VERIFY((numext::isnan)(data2[1]));
-  }
+template <typename Scalar>
+Scalar propagate_nan_max(const Scalar& a, const Scalar& b) {
+  if ((numext::isnan)(a)) return a;
+  if ((numext::isnan)(b)) return b;
+  return (numext::maxi)(a,b);
 }
 
-template<typename Scalar> void packetmath_notcomplex()
-{
-  using std::abs;
-  typedef internal::packet_traits<Scalar> PacketTraits;
-  typedef typename PacketTraits::type Packet;
-  const int PacketSize = PacketTraits::size;
+template <typename Scalar>
+Scalar propagate_nan_min(const Scalar& a, const Scalar& b) {
+  if ((numext::isnan)(a)) return a;
+  if ((numext::isnan)(b)) return b;
+  return (numext::mini)(a,b);
+}
 
-  EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4];
-  EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4];
-  EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4];
+template <typename Scalar>
+Scalar propagate_number_max(const Scalar& a, const Scalar& b) {
+  if ((numext::isnan)(a)) return b;
+  if ((numext::isnan)(b)) return a;
+  return (numext::maxi)(a,b);
+}
 
-  Array<Scalar,Dynamic,1>::Map(data1, PacketTraits::size*4).setRandom();
+template <typename Scalar>
+Scalar propagate_number_min(const Scalar& a, const Scalar& b) {
+  if ((numext::isnan)(a)) return b;
+  if ((numext::isnan)(b)) return a;
+  return (numext::mini)(a,b);
+}
 
-  ref[0] = data1[0];
-  for (int i=0; i<PacketSize; ++i)
-    ref[0] = (std::min)(ref[0],data1[i]);
-  VERIFY(internal::isApprox(ref[0], internal::predux_min(internal::pload<Packet>(data1))) && "internal::predux_min");
+template <typename Scalar, typename Packet>
+void packetmath_notcomplex() {
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
+
+  Array<Scalar, Dynamic, 1>::Map(data1, PacketSize * 4).setRandom();
 
   VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMin);
   VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMax);
 
   CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin);
   CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax);
-  CHECK_CWISE1(abs, internal::pabs);
 
+  CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, internal::pmin<PropagateNumbers>);
+  CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_number_max, internal::pmax<PropagateNumbers>);
+  CHECK_CWISE1(numext::abs, internal::pabs);
+  CHECK_CWISE2_IF(PacketTraits::HasAbsDiff, REF_ABS_DIFF, internal::pabsdiff);
+
+  ref[0] = data1[0];
+  for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin(ref[0], data1[i]);
+  VERIFY(internal::isApprox(ref[0], internal::predux_min(internal::pload<Packet>(data1))) && "internal::predux_min");
   ref[0] = data1[0];
-  for (int i=0; i<PacketSize; ++i)
-    ref[0] = (std::max)(ref[0],data1[i]);
+  for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax(ref[0], data1[i]);
   VERIFY(internal::isApprox(ref[0], internal::predux_max(internal::pload<Packet>(data1))) && "internal::predux_max");
 
-  for (int i=0; i<PacketSize; ++i)
-    ref[i] = data1[0]+Scalar(i);
+  for (int i = 0; i < PacketSize; ++i) ref[i] = data1[0] + Scalar(i);
   internal::pstore(data2, internal::plset<Packet>(data1[0]));
-  VERIFY(areApprox(ref, data2, PacketSize) && "internal::plset");
+  VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::plset");
+
+  {
+    unsigned char* data1_bits = reinterpret_cast<unsigned char*>(data1);
+    // predux_all - not needed yet
+    // for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0xff;
+    // VERIFY(internal::predux_all(internal::pload<Packet>(data1)) && "internal::predux_all(1111)");
+    // for(int k=0; k<PacketSize; ++k)
+    // {
+    //   for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x0;
+    //   VERIFY( (!internal::predux_all(internal::pload<Packet>(data1))) && "internal::predux_all(0101)");
+    //   for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
+    // }
+
+    // predux_any
+    for (unsigned int i = 0; i < PacketSize * sizeof(Scalar); ++i) data1_bits[i] = 0x0;
+    VERIFY((!internal::predux_any(internal::pload<Packet>(data1))) && "internal::predux_any(0000)");
+    for (int k = 0; k < PacketSize; ++k) {
+      for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0xff;
+      VERIFY(internal::predux_any(internal::pload<Packet>(data1)) && "internal::predux_any(0101)");
+      for (unsigned int i = 0; i < sizeof(Scalar); ++i) data1_bits[k * sizeof(Scalar) + i] = 0x00;
+    }
+  }
+
+
+  // Test NaN propagation.
+  if (!NumTraits<Scalar>::IsInteger) {
+    // Test reductions with no NaNs.
+    ref[0] = data1[0];
+    for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin<PropagateNumbers>(ref[0], data1[i]);
+    VERIFY(internal::isApprox(ref[0], internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))) && "internal::predux_min<PropagateNumbers>");
+    ref[0] = data1[0];
+    for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmin<PropagateNaN>(ref[0], data1[i]);
+    VERIFY(internal::isApprox(ref[0], internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))) && "internal::predux_min<PropagateNaN>");
+    ref[0] = data1[0];
+    for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax<PropagateNumbers>(ref[0], data1[i]);
+    VERIFY(internal::isApprox(ref[0], internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))) && "internal::predux_max<PropagateNumbers>");
+    ref[0] = data1[0];
+    for (int i = 0; i < PacketSize; ++i) ref[0] = internal::pmax<PropagateNaN>(ref[0], data1[i]);
+    VERIFY(internal::isApprox(ref[0], internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))) && "internal::predux_max<PropagateNumbers>");
+    // A single NaN.
+    const size_t index = std::numeric_limits<size_t>::quiet_NaN() % PacketSize;
+    data1[index] = NumTraits<Scalar>::quiet_NaN();
+    VERIFY(PacketSize==1 || !(numext::isnan)(internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))));
+    VERIFY(PacketSize==1 || !(numext::isnan)(internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))));
+    // All NaNs.
+    for (int i = 0; i < 4 * PacketSize; ++i) data1[i] = NumTraits<Scalar>::quiet_NaN();
+    VERIFY((numext::isnan)(internal::predux_min<PropagateNumbers>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_min<PropagateNaN>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_max<PropagateNumbers>(internal::pload<Packet>(data1))));
+    VERIFY((numext::isnan)(internal::predux_max<PropagateNaN>(internal::pload<Packet>(data1))));
+
+    // Test NaN propagation for coefficient-wise min and max.
+    for (int i = 0; i < PacketSize; ++i) {
+      data1[i] = internal::random<bool>() ? NumTraits<Scalar>::quiet_NaN() : Scalar(0);
+      data1[i + PacketSize] = internal::random<bool>() ? NumTraits<Scalar>::quiet_NaN() : Scalar(0);
+    }
+    // Note: NaN propagation is implementation defined for pmin/pmax, so we do not test it here.
+    CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_number_min, (internal::pmin<PropagateNumbers>));
+    CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_number_max, internal::pmax<PropagateNumbers>);
+    CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_nan_min, (internal::pmin<PropagateNaN>));
+    CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_nan_max, internal::pmax<PropagateNaN>);
+  }
+
+  packetmath_boolean_mask_ops_notcomplex<Scalar, Packet>();
 }
 
-template<typename Scalar,bool ConjLhs,bool ConjRhs> void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval)
-{
-  typedef internal::packet_traits<Scalar> PacketTraits;
-  typedef typename PacketTraits::type Packet;
-  const int PacketSize = PacketTraits::size;
+template <typename Scalar, typename Packet, bool ConjLhs, bool ConjRhs>
+void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) {
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
 
   internal::conj_if<ConjLhs> cj0;
   internal::conj_if<ConjRhs> cj1;
-  internal::conj_helper<Scalar,Scalar,ConjLhs,ConjRhs> cj;
-  internal::conj_helper<Packet,Packet,ConjLhs,ConjRhs> pcj;
+  internal::conj_helper<Scalar, Scalar, ConjLhs, ConjRhs> cj;
+  internal::conj_helper<Packet, Packet, ConjLhs, ConjRhs> pcj;
 
-  for(int i=0;i<PacketSize;++i)
-  {
+  for (int i = 0; i < PacketSize; ++i) {
     ref[i] = cj0(data1[i]) * cj1(data2[i]);
-    VERIFY(internal::isApprox(ref[i], cj.pmul(data1[i],data2[i])) && "conj_helper pmul");
+    VERIFY(internal::isApprox(ref[i], cj.pmul(data1[i], data2[i])) && "conj_helper pmul");
   }
-  internal::pstore(pval,pcj.pmul(internal::pload<Packet>(data1),internal::pload<Packet>(data2)));
-  VERIFY(areApprox(ref, pval, PacketSize) && "conj_helper pmul");
+  internal::pstore(pval, pcj.pmul(internal::pload<Packet>(data1), internal::pload<Packet>(data2)));
+  VERIFY(test::areApprox(ref, pval, PacketSize) && "conj_helper pmul");
 
-  for(int i=0;i<PacketSize;++i)
-  {
+  for (int i = 0; i < PacketSize; ++i) {
     Scalar tmp = ref[i];
     ref[i] += cj0(data1[i]) * cj1(data2[i]);
-    VERIFY(internal::isApprox(ref[i], cj.pmadd(data1[i],data2[i],tmp)) && "conj_helper pmadd");
+    VERIFY(internal::isApprox(ref[i], cj.pmadd(data1[i], data2[i], tmp)) && "conj_helper pmadd");
   }
-  internal::pstore(pval,pcj.pmadd(internal::pload<Packet>(data1),internal::pload<Packet>(data2),internal::pload<Packet>(pval)));
-  VERIFY(areApprox(ref, pval, PacketSize) && "conj_helper pmadd");
+  internal::pstore(
+      pval, pcj.pmadd(internal::pload<Packet>(data1), internal::pload<Packet>(data2), internal::pload<Packet>(pval)));
+  VERIFY(test::areApprox(ref, pval, PacketSize) && "conj_helper pmadd");
 }
 
-template<typename Scalar> void packetmath_complex()
-{
+template <typename Scalar, typename Packet>
+void packetmath_complex() {
   typedef internal::packet_traits<Scalar> PacketTraits;
-  typedef typename PacketTraits::type Packet;
-  const int PacketSize = PacketTraits::size;
+  typedef typename Scalar::value_type RealScalar;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
 
-  const int size = PacketSize*4;
-  EIGEN_ALIGN_MAX Scalar data1[PacketSize*4];
-  EIGEN_ALIGN_MAX Scalar data2[PacketSize*4];
-  EIGEN_ALIGN_MAX Scalar ref[PacketSize*4];
-  EIGEN_ALIGN_MAX Scalar pval[PacketSize*4];
+  const int size = PacketSize * 4;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4];
+  EIGEN_ALIGN_MAX Scalar pval[PacketSize * 4];
 
-  for (int i=0; i<size; ++i)
-  {
+  for (int i = 0; i < size; ++i) {
     data1[i] = internal::random<Scalar>() * Scalar(1e2);
     data2[i] = internal::random<Scalar>() * Scalar(1e2);
   }
 
-  test_conj_helper<Scalar,false,false> (data1,data2,ref,pval);
-  test_conj_helper<Scalar,false,true>  (data1,data2,ref,pval);
-  test_conj_helper<Scalar,true,false>  (data1,data2,ref,pval);
-  test_conj_helper<Scalar,true,true>   (data1,data2,ref,pval);
+  test_conj_helper<Scalar, Packet, false, false>(data1, data2, ref, pval);
+  test_conj_helper<Scalar, Packet, false, true>(data1, data2, ref, pval);
+  test_conj_helper<Scalar, Packet, true, false>(data1, data2, ref, pval);
+  test_conj_helper<Scalar, Packet, true, true>(data1, data2, ref, pval);
 
+  // Test pcplxflip.
   {
-    for(int i=0;i<PacketSize;++i)
-      ref[i] = Scalar(std::imag(data1[i]),std::real(data1[i]));
-    internal::pstore(pval,internal::pcplxflip(internal::pload<Packet>(data1)));
-    VERIFY(areApprox(ref, pval, PacketSize) && "pcplxflip");
+    for (int i = 0; i < PacketSize; ++i) ref[i] = Scalar(std::imag(data1[i]), std::real(data1[i]));
+    internal::pstore(pval, internal::pcplxflip(internal::pload<Packet>(data1)));
+    VERIFY(test::areApprox(ref, pval, PacketSize) && "pcplxflip");
+  }
+
+  if (PacketTraits::HasSqrt) {
+    for (int i = 0; i < size; ++i) {
+      data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>());
+    }
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, size);
+
+    // Test misc. corner cases.
+    const RealScalar zero = RealScalar(0);
+    const RealScalar one = RealScalar(1);
+    const RealScalar inf = std::numeric_limits<RealScalar>::infinity();
+    const RealScalar nan = std::numeric_limits<RealScalar>::quiet_NaN();
+    data1[0] = Scalar(zero, zero);
+    data1[1] = Scalar(-zero, zero);
+    data1[2] = Scalar(one, zero);
+    data1[3] = Scalar(zero, one);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(-one, zero);
+    data1[1] = Scalar(zero, -one);
+    data1[2] = Scalar(one, one);
+    data1[3] = Scalar(-one, -one);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(inf, zero);
+    data1[1] = Scalar(zero, inf);
+    data1[2] = Scalar(-inf, zero);
+    data1[3] = Scalar(zero, -inf);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(inf, inf);
+    data1[1] = Scalar(-inf, inf);
+    data1[2] = Scalar(inf, -inf);
+    data1[3] = Scalar(-inf, -inf);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(nan, zero);
+    data1[1] = Scalar(zero, nan);
+    data1[2] = Scalar(nan, one);
+    data1[3] = Scalar(one, nan);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
+    data1[0] = Scalar(nan, nan);
+    data1[1] = Scalar(inf, nan);
+    data1[2] = Scalar(nan, inf);
+    data1[3] = Scalar(-inf, nan);
+    CHECK_CWISE1_N(numext::sqrt, internal::psqrt, 4);
   }
 }
 
-template<typename Scalar> void packetmath_scatter_gather()
-{
-  typedef internal::packet_traits<Scalar> PacketTraits;
-  typedef typename PacketTraits::type Packet;
+template <typename Scalar, typename Packet>
+void packetmath_scatter_gather() {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  const int PacketSize = PacketTraits::size;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
   EIGEN_ALIGN_MAX Scalar data1[PacketSize];
-  RealScalar refvalue = 0;
-  for (int i=0; i<PacketSize; ++i) {
-    data1[i] = internal::random<Scalar>()/RealScalar(PacketSize);
+  RealScalar refvalue = RealScalar(0);
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
   }
 
-  int stride = internal::random<int>(1,20);
+  int stride = internal::random<int>(1, 20);
+
+  // Buffer of zeros.
+  EIGEN_ALIGN_MAX Scalar buffer[PacketSize * 20] = {};
 
-  EIGEN_ALIGN_MAX Scalar buffer[PacketSize*20];
-  memset(buffer, 0, 20*PacketSize*sizeof(Scalar));
   Packet packet = internal::pload<Packet>(data1);
   internal::pscatter<Scalar, Packet>(buffer, packet, stride);
 
-  for (int i = 0; i < PacketSize*20; ++i) {
-    if ((i%stride) == 0 && i<stride*PacketSize) {
-      VERIFY(isApproxAbs(buffer[i], data1[i/stride], refvalue) && "pscatter");
+  for (int i = 0; i < PacketSize * 20; ++i) {
+    if ((i % stride) == 0 && i < stride * PacketSize) {
+      VERIFY(test::isApproxAbs(buffer[i], data1[i / stride], refvalue) && "pscatter");
     } else {
-      VERIFY(isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter");
+      VERIFY(test::isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter");
     }
   }
 
-  for (int i=0; i<PacketSize*7; ++i) {
-    buffer[i] = internal::random<Scalar>()/RealScalar(PacketSize);
+  for (int i = 0; i < PacketSize * 7; ++i) {
+    buffer[i] = internal::random<Scalar>() / RealScalar(PacketSize);
   }
   packet = internal::pgather<Scalar, Packet>(buffer, 7);
   internal::pstore(data1, packet);
   for (int i = 0; i < PacketSize; ++i) {
-    VERIFY(isApproxAbs(data1[i], buffer[i*7], refvalue) && "pgather");
+    VERIFY(test::isApproxAbs(data1[i], buffer[i * 7], refvalue) && "pgather");
   }
 }
 
-void test_packetmath()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( packetmath<float>() );
-    CALL_SUBTEST_2( packetmath<double>() );
-    CALL_SUBTEST_3( packetmath<int>() );
-    CALL_SUBTEST_4( packetmath<std::complex<float> >() );
-    CALL_SUBTEST_5( packetmath<std::complex<double> >() );
-
-    CALL_SUBTEST_1( packetmath_notcomplex<float>() );
-    CALL_SUBTEST_2( packetmath_notcomplex<double>() );
-    CALL_SUBTEST_3( packetmath_notcomplex<int>() );
-
-    CALL_SUBTEST_1( packetmath_real<float>() );
-    CALL_SUBTEST_2( packetmath_real<double>() );
-
-    CALL_SUBTEST_4( packetmath_complex<std::complex<float> >() );
-    CALL_SUBTEST_5( packetmath_complex<std::complex<double> >() );
-
-    CALL_SUBTEST_1( packetmath_scatter_gather<float>() );
-    CALL_SUBTEST_2( packetmath_scatter_gather<double>() );
-    CALL_SUBTEST_3( packetmath_scatter_gather<int>() );
-    CALL_SUBTEST_4( packetmath_scatter_gather<std::complex<float> >() );
-    CALL_SUBTEST_5( packetmath_scatter_gather<std::complex<double> >() );
+namespace Eigen {
+namespace test {
+
+template <typename Scalar, typename PacketType>
+struct runall<Scalar, PacketType, false, false> {  // i.e. float or double
+  static void run() {
+    packetmath<Scalar, PacketType>();
+    packetmath_scatter_gather<Scalar, PacketType>();
+    packetmath_notcomplex<Scalar, PacketType>();
+    packetmath_real<Scalar, PacketType>();
+  }
+};
+
+template <typename Scalar, typename PacketType>
+struct runall<Scalar, PacketType, false, true> {  // i.e. int
+  static void run() {
+    packetmath<Scalar, PacketType>();
+    packetmath_scatter_gather<Scalar, PacketType>();
+    packetmath_notcomplex<Scalar, PacketType>();
+  }
+};
+
+template <typename Scalar, typename PacketType>
+struct runall<Scalar, PacketType, true, false> {  // i.e. complex
+  static void run() {
+    packetmath<Scalar, PacketType>();
+    packetmath_scatter_gather<Scalar, PacketType>();
+    packetmath_complex<Scalar, PacketType>();
+  }
+};
+
+}  // namespace test
+}  // namespace Eigen
+
+EIGEN_DECLARE_TEST(packetmath) {
+  g_first_pass = true;
+  for (int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(test::runner<float>::run());
+    CALL_SUBTEST_2(test::runner<double>::run());
+    CALL_SUBTEST_3(test::runner<int8_t>::run());
+    CALL_SUBTEST_4(test::runner<uint8_t>::run());
+    CALL_SUBTEST_5(test::runner<int16_t>::run());
+    CALL_SUBTEST_6(test::runner<uint16_t>::run());
+    CALL_SUBTEST_7(test::runner<int32_t>::run());
+    CALL_SUBTEST_8(test::runner<uint32_t>::run());
+    CALL_SUBTEST_9(test::runner<int64_t>::run());
+    CALL_SUBTEST_10(test::runner<uint64_t>::run());
+    CALL_SUBTEST_11(test::runner<std::complex<float> >::run());
+    CALL_SUBTEST_12(test::runner<std::complex<double> >::run());
+    CALL_SUBTEST_13(test::runner<half>::run());
+    CALL_SUBTEST_14((packetmath<bool, internal::packet_traits<bool>::type>()));
+    CALL_SUBTEST_15(test::runner<bfloat16>::run());
+    g_first_pass = false;
   }
 }
diff --git a/contrib/eigen/test/packetmath_test_shared.h b/contrib/eigen/test/packetmath_test_shared.h
new file mode 100644
index 0000000000000000000000000000000000000000..8624fe2fe2c8ccd8ddeb871fa2cbd86bb4b363b2
--- /dev/null
+++ b/contrib/eigen/test/packetmath_test_shared.h
@@ -0,0 +1,275 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <typeinfo>
+
+#if defined __GNUC__ && __GNUC__>=6
+  #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+// using namespace Eigen;
+
+bool g_first_pass = true;
+
+namespace Eigen {
+namespace internal {
+
+template<typename T> T negate(const T& x) { return -x; }
+
+template<typename T>
+Map<const Array<unsigned char,sizeof(T),1> >
+bits(const T& x) {
+  return Map<const Array<unsigned char,sizeof(T),1> >(reinterpret_cast<const unsigned char *>(&x));
+}
+
+// The following implement bitwise operations on floating point types
+template<typename T,typename Bits,typename Func>
+T apply_bit_op(Bits a, Bits b, Func f) {
+  Array<unsigned char,sizeof(T),1> data;
+  T res;
+  for(Index i = 0; i < data.size(); ++i)
+    data[i] = f(a[i], b[i]);
+  // Note: The reinterpret_cast works around GCC's class-memaccess warnings:
+  std::memcpy(reinterpret_cast<unsigned char*>(&res), data.data(), sizeof(T));
+  return res;
+}
+
+#define EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,T)             \
+  template<> T EIGEN_CAT(p,OP)(const T& a,const T& b) { \
+    return apply_bit_op<T>(bits(a),bits(b),FUNC);     \
+  }
+
+#define EIGEN_TEST_MAKE_BITWISE(OP,FUNC)                  \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,float)                 \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,double)                \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,half)                  \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,bfloat16)              \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,std::complex<float>)   \
+  EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,std::complex<double>)
+
+EIGEN_TEST_MAKE_BITWISE(xor,std::bit_xor<unsigned char>())
+EIGEN_TEST_MAKE_BITWISE(and,std::bit_and<unsigned char>())
+EIGEN_TEST_MAKE_BITWISE(or, std::bit_or<unsigned char>())
+struct bit_andnot{
+  template<typename T> T
+  operator()(T a, T b) const { return a & (~b); }
+};
+EIGEN_TEST_MAKE_BITWISE(andnot, bit_andnot())
+template<typename T>
+bool biteq(T a, T b) {
+  return (bits(a) == bits(b)).all();
+}
+
+}
+
+namespace test {
+
+// NOTE: we disable inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU.
+template<typename Scalar> EIGEN_DONT_INLINE
+bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits<Scalar>::Real& refvalue)
+{
+  return internal::isMuchSmallerThan(a-b, refvalue);
+}
+
+template<typename Scalar>
+inline void print_mismatch(const Scalar* ref, const Scalar* vec, int size) {
+  std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(ref,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(vec,size) << "]\n";
+}
+
+template<typename Scalar> bool areApproxAbs(const Scalar* a, const Scalar* b, int size, const typename NumTraits<Scalar>::Real& refvalue)
+{
+  for (int i=0; i<size; ++i)
+  {
+    if (!isApproxAbs(a[i],b[i],refvalue))
+    {
+      print_mismatch(a, b, size);
+      return false;
+    }
+  }
+  return true;
+}
+
+template<typename Scalar> bool areApprox(const Scalar* a, const Scalar* b, int size)
+{
+  for (int i=0; i<size; ++i)
+  {
+    if ( a[i]!=b[i] && !internal::isApprox(a[i],b[i]) 
+         && !((numext::isnan)(a[i]) && (numext::isnan)(b[i])) )
+    {
+      print_mismatch(a, b, size);
+      return false;
+    }
+  }
+  return true;
+}
+
+template<typename Scalar> bool areEqual(const Scalar* a, const Scalar* b, int size)
+{
+  for (int i=0; i<size; ++i)
+  {
+    if ( (a[i] != b[i]) && !((numext::isnan)(a[i]) && (numext::isnan)(b[i])) )
+    {
+      print_mismatch(a, b, size);
+      return false;
+    }
+  }
+  return true;
+}
+
+#define CHECK_CWISE1(REFOP, POP) { \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = REFOP(data1[i]); \
+  internal::pstore(data2, POP(internal::pload<Packet>(data1))); \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
+}
+
+// Checks component-wise for input of size N. All of data1, data2, and ref
+// should have size at least ceil(N/PacketSize)*PacketSize to avoid memory
+// access errors.
+#define CHECK_CWISE1_N(REFOP, POP, N) { \
+  for (int i=0; i<N; ++i) \
+    ref[i] = REFOP(data1[i]); \
+  for (int j=0; j<N; j+=PacketSize) \
+    internal::pstore(data2 + j, POP(internal::pload<Packet>(data1 + j))); \
+  VERIFY(test::areApprox(ref, data2, N) && #POP); \
+}
+
+template<bool Cond,typename Packet>
+struct packet_helper
+{
+  template<typename T>
+  inline Packet load(const T* from) const { return internal::pload<Packet>(from); }
+
+  template<typename T>
+  inline Packet loadu(const T* from) const { return internal::ploadu<Packet>(from); }
+
+  template<typename T>
+  inline Packet load(const T* from, unsigned long long umask) const { return internal::ploadu<Packet>(from, umask); }
+
+  template<typename T>
+  inline void store(T* to, const Packet& x) const { internal::pstore(to,x); }
+
+  template<typename T>
+  inline void store(T* to, const Packet& x, unsigned long long umask) const { internal::pstoreu(to, x, umask); }
+
+  template<typename T>
+  inline Packet& forward_reference(Packet& packet, T& /*scalar*/) const { return packet; }
+};
+
+template<typename Packet>
+struct packet_helper<false,Packet>
+{
+  template<typename T>
+  inline T load(const T* from) const { return *from; }
+
+  template<typename T>
+  inline T loadu(const T* from) const { return *from; }
+
+  template<typename T>
+  inline T load(const T* from, unsigned long long) const { return *from; }
+
+  template<typename T>
+  inline void store(T* to, const T& x) const { *to = x; }
+
+  template<typename T>
+  inline void store(T* to, const T& x, unsigned long long) const { *to = x; }
+
+  template<typename T>
+  inline T& forward_reference(Packet& /*packet*/, T& scalar) const { return scalar; }
+};
+
+#define CHECK_CWISE1_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i])); \
+  h.store(data2, POP(h.load(data1))); \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
+}
+
+#define CHECK_CWISE1_EXACT_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i])); \
+  h.store(data2, POP(h.load(data1))); \
+  VERIFY(test::areEqual(ref, data2, PacketSize) && #POP); \
+}
+
+#define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i], data1[i+PacketSize]));     \
+  h.store(data2, POP(h.load(data1),h.load(data1+PacketSize))); \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
+}
+
+// One input, one output by reference.
+#define CHECK_CWISE1_BYREF1_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i], ref[i+PacketSize]));     \
+  Packet pout; \
+  Scalar sout; \
+  h.store(data2, POP(h.load(data1), h.forward_reference(pout, sout))); \
+  h.store(data2+PacketSize, h.forward_reference(pout, sout)); \
+  VERIFY(test::areApprox(ref, data2, 2 * PacketSize) && #POP); \
+}
+
+#define CHECK_CWISE3_IF(COND, REFOP, POP) if (COND) {                      \
+  test::packet_helper<COND, Packet> h;                                     \
+  for (int i = 0; i < PacketSize; ++i)                                     \
+    ref[i] = Scalar(REFOP(data1[i], data1[i + PacketSize],                 \
+                          data1[i + 2 * PacketSize]));                     \
+  h.store(data2, POP(h.load(data1), h.load(data1 + PacketSize),            \
+                     h.load(data1 + 2 * PacketSize)));                     \
+  VERIFY(test::areApprox(ref, data2, PacketSize) && #POP);                 \
+}
+
+// Specialize the runall struct in your test file by defining run().
+template<
+  typename Scalar,
+  typename PacketType,
+  bool IsComplex = NumTraits<Scalar>::IsComplex,
+  bool IsInteger = NumTraits<Scalar>::IsInteger>
+struct runall;
+
+template<
+  typename Scalar,
+  typename PacketType = typename internal::packet_traits<Scalar>::type,
+  bool Vectorized = internal::packet_traits<Scalar>::Vectorizable,
+  bool HasHalf = !internal::is_same<typename internal::unpacket_traits<PacketType>::half,PacketType>::value >
+struct runner;
+
+template<typename Scalar,typename PacketType>
+struct runner<Scalar,PacketType,true,true>
+{
+  static void run() {
+    runall<Scalar,PacketType>::run();
+    runner<Scalar,typename internal::unpacket_traits<PacketType>::half>::run();
+  }
+};
+
+template<typename Scalar,typename PacketType>
+struct runner<Scalar,PacketType,true,false>
+{
+  static void run() {
+    runall<Scalar,PacketType>::run();
+  }
+};
+
+template<typename Scalar,typename PacketType>
+struct runner<Scalar,PacketType,false,false>
+{
+  static void run() {
+    runall<Scalar,PacketType>::run();
+  }
+};
+
+}
+}
diff --git a/contrib/eigen/test/pardiso_support.cpp b/contrib/eigen/test/pardiso_support.cpp
index 67efad6d80450fd6137185330790c81ab4485561..9c16ded5bcfc6427c35158455d4a6f4bdf853b3a 100644
--- a/contrib/eigen/test/pardiso_support.cpp
+++ b/contrib/eigen/test/pardiso_support.cpp
@@ -20,7 +20,7 @@ template<typename T> void test_pardiso_T()
   check_sparse_square_solving(pardiso_lu);
 }
 
-void test_pardiso_support()
+EIGEN_DECLARE_TEST(pardiso_support)
 {
   CALL_SUBTEST_1(test_pardiso_T<float>());
   CALL_SUBTEST_2(test_pardiso_T<double>());
diff --git a/contrib/eigen/test/pastix_support.cpp b/contrib/eigen/test/pastix_support.cpp
index b62f857394dce4e944b2e80c6c85a7ceda2767dd..9b64417c13d83033187350085da38a8d04e08cfa 100644
--- a/contrib/eigen/test/pastix_support.cpp
+++ b/contrib/eigen/test/pastix_support.cpp
@@ -45,7 +45,7 @@ template<typename T> void test_pastix_T_LU()
   check_sparse_square_solving(pastix_lu);
 }
 
-void test_pastix_support()
+EIGEN_DECLARE_TEST(pastix_support)
 {
   CALL_SUBTEST_1(test_pastix_T<float>());
   CALL_SUBTEST_2(test_pastix_T<double>());
diff --git a/contrib/eigen/test/permutationmatrices.cpp b/contrib/eigen/test/permutationmatrices.cpp
index e885f0e0414dd8b34a8e22eff7898a2948e47d85..d4b68b2d488390c67030280a3e1b125d37620a57 100644
--- a/contrib/eigen/test/permutationmatrices.cpp
+++ b/contrib/eigen/test/permutationmatrices.cpp
@@ -54,7 +54,11 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
   m_permuted = m_original;
   VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, 1);
   VERIFY_IS_APPROX(m_permuted, lm*m_original*rm);
-  
+
+  LeftPermutationType lpi;
+  lpi = lp.inverse();
+  VERIFY_IS_APPROX(lpi*m_permuted,lp.inverse()*m_permuted);
+
   VERIFY_IS_APPROX(lp.inverse()*m_permuted*rp.inverse(), m_original);
   VERIFY_IS_APPROX(lv.asPermutation().inverse()*m_permuted*rv.asPermutation().inverse(), m_original);
   VERIFY_IS_APPROX(MapLeftPerm(lv.data(),lv.size()).inverse()*m_permuted*MapRightPerm(rv.data(),rv.size()).inverse(), m_original);
@@ -126,6 +130,16 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
   VERIFY_IS_APPROX(m_permuted, lp*m_original*rp.transpose());
   
   VERIFY_IS_APPROX(lt.inverse()*m_permuted*rt.inverse(), m_original);
+
+  // Check inplace transpositions
+  m_permuted = m_original;
+  VERIFY_IS_APPROX(m_permuted = lt * m_permuted, lp * m_original);
+  m_permuted = m_original;
+  VERIFY_IS_APPROX(m_permuted = lt.inverse() * m_permuted, lp.inverse() * m_original);
+  m_permuted = m_original;
+  VERIFY_IS_APPROX(m_permuted = m_permuted * rt, m_original * rt);
+  m_permuted = m_original;
+  VERIFY_IS_APPROX(m_permuted = m_permuted * rt.inverse(), m_original * rt.inverse());
 }
 
 template<typename T>
@@ -147,12 +161,12 @@ void bug890()
 
   MapType(v1.data(),2,1,S(1,1)) = P * MapType(rhs.data(),2,1,S(1,1));
   VERIFY_IS_APPROX(v1, (P * rhs).eval());
-  
+
   MapType(v1.data(),2,1,S(1,1)) = P.inverse() * MapType(rhs.data(),2,1,S(1,1));
   VERIFY_IS_APPROX(v1, (P.inverse() * rhs).eval());
 }
 
-void test_permutationmatrices()
+EIGEN_DECLARE_TEST(permutationmatrices)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( permutationmatrices(Matrix<float, 1, 1>()) );
@@ -160,8 +174,8 @@ void test_permutationmatrices()
     CALL_SUBTEST_3( permutationmatrices(Matrix<double,3,3,RowMajor>()) );
     CALL_SUBTEST_4( permutationmatrices(Matrix4d()) );
     CALL_SUBTEST_5( permutationmatrices(Matrix<double,40,60>()) );
-    CALL_SUBTEST_6( permutationmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(20, 30)) );
-    CALL_SUBTEST_7( permutationmatrices(MatrixXcf(15, 10)) );
+    CALL_SUBTEST_6( permutationmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_7( permutationmatrices(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
   CALL_SUBTEST_5( bug890<double>() );
 }
diff --git a/contrib/eigen/test/prec_inverse_4x4.cpp b/contrib/eigen/test/prec_inverse_4x4.cpp
index eb6ad18c94de01d11ab696ca5f4529bce20f8c15..86f057118cf1c3ba1e906eb7fd6160c4cbd4636f 100644
--- a/contrib/eigen/test/prec_inverse_4x4.cpp
+++ b/contrib/eigen/test/prec_inverse_4x4.cpp
@@ -30,18 +30,17 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
 {
   using std::abs;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
   double error_sum = 0., error_max = 0.;
   for(int i = 0; i < repeat; ++i)
   {
     MatrixType m;
-    RealScalar absdet;
+    bool is_invertible;
     do {
       m = MatrixType::Random();
-      absdet = abs(m.determinant());
-    } while(absdet < NumTraits<Scalar>::epsilon());
+      is_invertible = Eigen::FullPivLU<MatrixType>(m).isInvertible();
+    } while(!is_invertible);
     MatrixType inv = m.inverse();
-    double error = double( (m*inv-MatrixType::Identity()).norm() * absdet / NumTraits<Scalar>::epsilon() );
+    double error = double( (m*inv-MatrixType::Identity()).norm());
     error_sum += error;
     error_max = (std::max)(error_max, error);
   }
@@ -68,7 +67,7 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
   }
 }
 
-void test_prec_inverse_4x4()
+EIGEN_DECLARE_TEST(prec_inverse_4x4)
 {
   CALL_SUBTEST_1((inverse_permutation_4x4<Matrix4f>()));
   CALL_SUBTEST_1(( inverse_general_4x4<Matrix4f>(200000 * g_repeat) ));
diff --git a/contrib/eigen/test/product.h b/contrib/eigen/test/product.h
index a4e0c569e1442bff567f14caea51b7f086f6d548..c6c78fbd8155f8e483fe58292612d3ac34226666 100644
--- a/contrib/eigen/test/product.h
+++ b/contrib/eigen/test/product.h
@@ -227,6 +227,8 @@ template<typename MatrixType> void product(const MatrixType& m)
     // CwiseBinaryOp
     VERIFY_IS_APPROX(x = y + A*x, A*z);
     x = z;
+    VERIFY_IS_APPROX(x = y - A*x, A*(-z));
+    x = z;
     // CwiseUnaryOp
     VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z);
   }
diff --git a/contrib/eigen/test/product_extra.cpp b/contrib/eigen/test/product_extra.cpp
index de2709d8b5a7bd9455a497591f76662ab13c3669..15c69896e00dd3746df16cad2d7144c75e9a205c 100644
--- a/contrib/eigen/test/product_extra.cpp
+++ b/contrib/eigen/test/product_extra.cpp
@@ -93,6 +93,22 @@ template<typename MatrixType> void product_extra(const MatrixType& m)
 
   VERIFY_IS_APPROX(m1.col(j2).adjoint() * m1.block(0,j,m1.rows(),c), m1.col(j2).adjoint().eval() * m1.block(0,j,m1.rows(),c).eval());
   VERIFY_IS_APPROX(m1.block(i,0,r,m1.cols()) * m1.row(i2).adjoint(), m1.block(i,0,r,m1.cols()).eval() * m1.row(i2).adjoint().eval());
+
+  // test negative strides
+  {
+    Map<MatrixType,Unaligned,Stride<Dynamic,Dynamic> > map1(&m1(rows-1,cols-1), rows, cols, Stride<Dynamic,Dynamic>(-m1.outerStride(),-1));
+    Map<MatrixType,Unaligned,Stride<Dynamic,Dynamic> > map2(&m2(rows-1,cols-1), rows, cols, Stride<Dynamic,Dynamic>(-m2.outerStride(),-1));
+    Map<RowVectorType,Unaligned,InnerStride<-1> > mapv1(&v1(v1.size()-1), v1.size(), InnerStride<-1>(-1));
+    Map<ColVectorType,Unaligned,InnerStride<-1> > mapvc2(&vc2(vc2.size()-1), vc2.size(), InnerStride<-1>(-1));
+    VERIFY_IS_APPROX(MatrixType(map1), m1.reverse());
+    VERIFY_IS_APPROX(MatrixType(map2), m2.reverse());
+    VERIFY_IS_APPROX(m3.noalias() = MatrixType(map1) * MatrixType(map2).adjoint(), m1.reverse() * m2.reverse().adjoint());
+    VERIFY_IS_APPROX(m3.noalias() = map1 * map2.adjoint(), m1.reverse() * m2.reverse().adjoint());
+    VERIFY_IS_APPROX(map1 * vc2, m1.reverse() * vc2);
+    VERIFY_IS_APPROX(m1 * mapvc2, m1 * mapvc2);
+    VERIFY_IS_APPROX(map1.adjoint() * v1.transpose(), m1.adjoint().reverse() * v1.transpose());
+    VERIFY_IS_APPROX(m1.adjoint() * mapv1.transpose(), m1.adjoint() * v1.reverse().transpose());
+  }
   
   // regression test
   MatrixType tmp = m1 * m1.adjoint() * s1;
@@ -352,7 +368,7 @@ void bug_1308()
   VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
 }
 
-void test_product_extra()
+EIGEN_DECLARE_TEST(product_extra)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product_extra(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/contrib/eigen/test/product_large.cpp b/contrib/eigen/test/product_large.cpp
index 14a4f739d4857beee982b0db22724b55c8f35d2b..3d0204b5fba4f16c3c5dbe386dcbc34e21ff9e50 100644
--- a/contrib/eigen/test/product_large.cpp
+++ b/contrib/eigen/test/product_large.cpp
@@ -8,6 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "product.h"
+#include <Eigen/LU>
 
 template<typename T>
 void test_aliasing()
@@ -30,21 +31,9 @@ void test_aliasing()
   x = z;
 }
 
-void test_product_large()
+template<int>
+void product_large_regressions()
 {
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,10), internal::random<int>(1,10))) );
-
-    CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-    CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
-    CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-
-    CALL_SUBTEST_1( test_aliasing<float>() );
-  }
-
-#if defined EIGEN_TEST_PART_6
   {
     // test a specific issue in DiagonalProduct
     int N = 1000000;
@@ -97,7 +86,40 @@ void test_product_large()
                 * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
     VERIFY_IS_APPROX(B,C);
   }
-#endif
+}
+
+template<int>
+void bug_1622() {
+  typedef Matrix<double, 2, -1, 0, 2, -1> Mat2X;
+  Mat2X x(2,2); x.setRandom();
+  MatrixXd y(2,2); y.setRandom();
+  const Mat2X K1 = x * y.inverse();
+  const Matrix2d K2 = x * y.inverse();
+  VERIFY_IS_APPROX(K1,K2);
+}
+
+EIGEN_DECLARE_TEST(product_large)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,10), internal::random<int>(1,10))) );
+
+    CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+
+    CALL_SUBTEST_1( test_aliasing<float>() );
+
+    CALL_SUBTEST_6( bug_1622<1>() );
+
+    CALL_SUBTEST_7( product(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+    CALL_SUBTEST_8( product(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_9( product(Matrix<std::complex<float>,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_10( product(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+  }
+
+  CALL_SUBTEST_6( product_large_regressions<0>() );
 
   // Regression test for bug 714:
 #if defined EIGEN_HAS_OPENMP
diff --git a/contrib/eigen/test/product_mmtr.cpp b/contrib/eigen/test/product_mmtr.cpp
index 35686460cdc46681dba2f4f014721e19bdb1b41a..8f8c5fe1f9c96eb0f8f53dfccbaf34353eb84c0d 100644
--- a/contrib/eigen/test/product_mmtr.cpp
+++ b/contrib/eigen/test/product_mmtr.cpp
@@ -94,7 +94,7 @@ template<typename Scalar> void mmtr(int size)
   }
 }
 
-void test_product_mmtr()
+EIGEN_DECLARE_TEST(product_mmtr)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/contrib/eigen/test/product_notemporary.cpp b/contrib/eigen/test/product_notemporary.cpp
index 28865d3980653cec087b64fca7b340a578e17626..20cb7c0801bb468296afd468e20fa1e57b189db2 100644
--- a/contrib/eigen/test/product_notemporary.cpp
+++ b/contrib/eigen/test/product_notemporary.cpp
@@ -11,6 +11,37 @@
 
 #include "main.h"
 
+template<typename Dst, typename Lhs, typename Rhs>
+void check_scalar_multiple3(Dst &dst, const Lhs& A, const Rhs& B)
+{
+  VERIFY_EVALUATION_COUNT( (dst.noalias()  = A * B), 0);
+  VERIFY_IS_APPROX( dst, (A.eval() * B.eval()).eval() );
+  VERIFY_EVALUATION_COUNT( (dst.noalias() += A * B), 0);
+  VERIFY_IS_APPROX( dst, 2*(A.eval() * B.eval()).eval() );
+  VERIFY_EVALUATION_COUNT( (dst.noalias() -= A * B), 0);
+  VERIFY_IS_APPROX( dst, (A.eval() * B.eval()).eval() );
+}
+
+template<typename Dst, typename Lhs, typename Rhs, typename S2>
+void check_scalar_multiple2(Dst &dst, const Lhs& A, const Rhs& B, S2 s2)
+{
+  CALL_SUBTEST( check_scalar_multiple3(dst, A,    B) );
+  CALL_SUBTEST( check_scalar_multiple3(dst, A,   -B) );
+  CALL_SUBTEST( check_scalar_multiple3(dst, A, s2*B) );
+  CALL_SUBTEST( check_scalar_multiple3(dst, A, B*s2) );
+  CALL_SUBTEST( check_scalar_multiple3(dst, A, (B*s2).conjugate()) );
+}
+
+template<typename Dst, typename Lhs, typename Rhs, typename S1, typename S2>
+void check_scalar_multiple1(Dst &dst, const Lhs& A, const Rhs& B, S1 s1, S2 s2)
+{
+  CALL_SUBTEST( check_scalar_multiple2(dst,    A, B, s2) );
+  CALL_SUBTEST( check_scalar_multiple2(dst,   -A, B, s2) );
+  CALL_SUBTEST( check_scalar_multiple2(dst, s1*A, B, s2) );
+  CALL_SUBTEST( check_scalar_multiple2(dst, A*s1, B, s2) );
+  CALL_SUBTEST( check_scalar_multiple2(dst, (A*s1).conjugate(), B, s2) );
+}
+
 template<typename MatrixType> void product_notemporary(const MatrixType& m)
 {
   /* This test checks the number of temporaries created
@@ -105,7 +136,9 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
   VERIFY_EVALUATION_COUNT( m3.noalias() = m1.block(r0,r0,r1,r1).template triangularView<UnitUpper>()  * m2.block(r0,c0,r1,c1), 1);
 
   // Zero temporaries for lazy products ...
+  m3.setRandom(rows,cols);
   VERIFY_EVALUATION_COUNT( Scalar tmp = 0; tmp += Scalar(RealScalar(1)) /  (m3.transpose().lazyProduct(m3)).diagonal().sum(), 0 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() = m1.conjugate().lazyProduct(m2.conjugate()), 0);
 
   // ... and even no temporary for even deeply (>=2) nested products
   VERIFY_EVALUATION_COUNT( Scalar tmp = 0; tmp += Scalar(RealScalar(1)) /  (m3.transpose() * m3).diagonal().sum(), 0 );
@@ -127,11 +160,19 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
   VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 );
 
   // Check outer products
+  #ifdef EIGEN_ALLOCA
+  bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT;
+  #else
+  bool temp_via_alloca = false;
+  #endif
   m3 = cv1 * rv1;
   VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 );
-  VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
   VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 );
   VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 );
+  rm3 = cv1 * rv1;
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
   VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 );
   VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
   VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );
@@ -140,9 +181,18 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
   // Check nested products
   VERIFY_EVALUATION_COUNT( cvres.noalias() = m1.adjoint() * m1 * cv1, 1 );
   VERIFY_EVALUATION_COUNT( rvres.noalias() = rv1 * (m1 * m2.adjoint()), 1 );
+
+  // exhaustively check all scalar multiple combinations:
+  {
+    // Generic path:
+    check_scalar_multiple1(m3, m1, m2, s1, s2);
+    // Force fall back to coeff-based:
+    typename ColMajorMatrixType::BlockXpr m3_blck = m3.block(r0,r0,1,1);
+    check_scalar_multiple1(m3_blck, m1.block(r0,c0,1,1), m2.block(c0,r0,1,1), s1, s2);
+  }
 }
 
-void test_product_notemporary()
+EIGEN_DECLARE_TEST(product_notemporary)
 {
   int s;
   for(int i = 0; i < g_repeat; i++) {
diff --git a/contrib/eigen/test/product_selfadjoint.cpp b/contrib/eigen/test/product_selfadjoint.cpp
index 88d68391ba59f068f6c30b36adbdcac3de7f07c2..bdccd049129a8a3d3525fa3b4ad3e62b6b4be629 100644
--- a/contrib/eigen/test/product_selfadjoint.cpp
+++ b/contrib/eigen/test/product_selfadjoint.cpp
@@ -59,7 +59,7 @@ template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
   }
 }
 
-void test_product_selfadjoint()
+EIGEN_DECLARE_TEST(product_selfadjoint)
 {
   int s = 0;
   for(int i = 0; i < g_repeat ; i++) {
diff --git a/contrib/eigen/test/product_small.cpp b/contrib/eigen/test/product_small.cpp
index fdfdd9f6c54ecfdbe163207732366a9853ab1dd2..1d6df6e58765451d0bbf58508111816a85b75620 100644
--- a/contrib/eigen/test/product_small.cpp
+++ b/contrib/eigen/test/product_small.cpp
@@ -56,6 +56,30 @@ test_lazy_single(int rows, int cols, int depth)
   VERIFY_IS_APPROX(C+=A.lazyProduct(B), ref_prod(D,A,B));
 }
 
+void test_dynamic_bool()
+{
+  int rows = internal::random<int>(1,64);
+  int cols = internal::random<int>(1,64);
+  int depth = internal::random<int>(1,65);
+
+  typedef Matrix<bool,Dynamic,Dynamic> MatrixX;
+  MatrixX A(rows,depth); A.setRandom();
+  MatrixX B(depth,cols); B.setRandom();
+  MatrixX C(rows,cols);  C.setRandom();
+  MatrixX D(C);
+  for(Index i=0;i<C.rows();++i)
+    for(Index j=0;j<C.cols();++j)
+      for(Index k=0;k<A.cols();++k)
+       D.coeffRef(i,j) |= A.coeff(i,k) & B.coeff(k,j);
+  C += A * B;
+  VERIFY_IS_EQUAL(C, D);
+
+  MatrixX E = B.transpose();
+  for(Index i=0;i<B.rows();++i)
+    for(Index j=0;j<B.cols();++j)
+      VERIFY_IS_EQUAL(B(i,j), E(j,i));
+}
+
 template<typename T, int Rows, int Cols, int Depth, int OC, int OA, int OB>
 typename internal::enable_if<  ( (Rows ==1&&Depth!=1&&OA==ColMajor)
                               || (Depth==1&&Rows !=1&&OA==RowMajor)
@@ -78,7 +102,7 @@ void test_lazy_all_layout(int rows=Rows, int cols=Cols, int depth=Depth)
   CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,ColMajor,RowMajor>(rows,cols,depth) ));
   CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,ColMajor,RowMajor,RowMajor>(rows,cols,depth) ));
   CALL_SUBTEST(( test_lazy_single<T,Rows,Cols,Depth,RowMajor,RowMajor,RowMajor>(rows,cols,depth) ));
-}
+}  
 
 template<typename T>
 void test_lazy_l1()
@@ -228,7 +252,37 @@ void bug_1311()
   VERIFY_IS_APPROX(res, A*b);
 }
 
-void test_product_small()
+template<int>
+void product_small_regressions()
+{
+  {
+    // test compilation of (outer_product) * vector
+    Vector3f v = Vector3f::Random();
+    VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v);
+  }
+  
+  {
+    // regression test for pull-request #93
+    Eigen::Matrix<double, 1, 1> A;  A.setRandom();
+    Eigen::Matrix<double, 18, 1> B; B.setRandom();
+    Eigen::Matrix<double, 1, 18> C; C.setRandom();
+    VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]);
+    VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C);
+  }
+
+  {
+    Eigen::Matrix<double, 10, 10> A, B, C;
+    A.setRandom();
+    C = A;
+    for(int k=0; k<79; ++k)
+      C = C * A;
+    B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
+                * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
+    VERIFY_IS_APPROX(B,C);
+  }
+}
+
+EIGEN_DECLARE_TEST(product_small)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
@@ -261,33 +315,9 @@ void test_product_small()
 
     CALL_SUBTEST_6( bug_1311<3>() );
     CALL_SUBTEST_6( bug_1311<5>() );
-  }
 
-#ifdef EIGEN_TEST_PART_6
-  {
-    // test compilation of (outer_product) * vector
-    Vector3f v = Vector3f::Random();
-    VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v);
-  }
-  
-  {
-    // regression test for pull-request #93
-    Eigen::Matrix<double, 1, 1> A;  A.setRandom();
-    Eigen::Matrix<double, 18, 1> B; B.setRandom();
-    Eigen::Matrix<double, 1, 18> C; C.setRandom();
-    VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]);
-    VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C);
+    CALL_SUBTEST_9( test_dynamic_bool() );
   }
 
-  {
-    Eigen::Matrix<double, 10, 10> A, B, C;
-    A.setRandom();
-    C = A;
-    for(int k=0; k<79; ++k)
-      C = C * A;
-    B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
-                * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
-    VERIFY_IS_APPROX(B,C);
-  }
-#endif
+  CALL_SUBTEST_6( product_small_regressions<0>() );
 }
diff --git a/contrib/eigen/test/product_symm.cpp b/contrib/eigen/test/product_symm.cpp
index 0ed027dff1aec61100add13bbcb0383689a61906..ea8d4d5cfee6a8b1dff1bcd9bf8083aa35c7c3fe 100644
--- a/contrib/eigen/test/product_symm.cpp
+++ b/contrib/eigen/test/product_symm.cpp
@@ -108,7 +108,7 @@ template<typename Scalar, int Size, int OtherSize> void symm(int size = Size, in
   }
 }
 
-void test_product_symm()
+EIGEN_DECLARE_TEST(product_symm)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/contrib/eigen/test/product_syrk.cpp b/contrib/eigen/test/product_syrk.cpp
index b8578215f9931d1384d77f9c384c58fc8795b1ac..8becd37fc21be55d8d59cff62339c597f308d4df 100644
--- a/contrib/eigen/test/product_syrk.cpp
+++ b/contrib/eigen/test/product_syrk.cpp
@@ -128,7 +128,7 @@ template<typename MatrixType> void syrk(const MatrixType& m)
   }
 }
 
-void test_product_syrk()
+EIGEN_DECLARE_TEST(product_syrk)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/contrib/eigen/test/product_trmm.cpp b/contrib/eigen/test/product_trmm.cpp
index ddcde9622ac8458821a8f3f0708a09ee6ee05e8d..2bb4b9e4782f8f9f47122d69714d20eaa3496ff9 100644
--- a/contrib/eigen/test/product_trmm.cpp
+++ b/contrib/eigen/test/product_trmm.cpp
@@ -125,7 +125,7 @@ void trmm(int rows=get_random_size<Scalar>(), int cols=get_random_size<Scalar>()
   CALL_ALL_ORDERS(EIGEN_CAT(3,NB),SCALAR,StrictlyLower)
   
 
-void test_product_trmm()
+EIGEN_DECLARE_TEST(product_trmm)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/contrib/eigen/test/product_trmv.cpp b/contrib/eigen/test/product_trmv.cpp
index 65d66e57b781ee498cbbf9bd5adf757dd6164b1e..5eb1b5ac0855885fdf02c36b49568d2019a68dc3 100644
--- a/contrib/eigen/test/product_trmv.cpp
+++ b/contrib/eigen/test/product_trmv.cpp
@@ -70,7 +70,7 @@ template<typename MatrixType> void trmv(const MatrixType& m)
   // TODO check with sub-matrices
 }
 
-void test_product_trmv()
+EIGEN_DECLARE_TEST(product_trmv)
 {
   int s = 0;
   for(int i = 0; i < g_repeat ; i++) {
diff --git a/contrib/eigen/test/product_trsolve.cpp b/contrib/eigen/test/product_trsolve.cpp
index eaf62cb1139e79bccdf00a48d43e4d029cc7391f..c59748c5b16814bb7b3204c63f3026a70ccb0f7d 100644
--- a/contrib/eigen/test/product_trsolve.cpp
+++ b/contrib/eigen/test/product_trsolve.cpp
@@ -99,7 +99,7 @@ template<typename Scalar,int Size, int Cols> void trsolve(int size=Size,int cols
   }
 }
 
-void test_product_trsolve()
+EIGEN_DECLARE_TEST(product_trsolve)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/contrib/eigen/test/qr.cpp b/contrib/eigen/test/qr.cpp
index 56884605b6c51eaf6f165d56d0426352db3a64da..c38e3439b69053fc970aacc6ae0fb022aff3ecb7 100644
--- a/contrib/eigen/test/qr.cpp
+++ b/contrib/eigen/test/qr.cpp
@@ -9,6 +9,7 @@
 
 #include "main.h"
 #include <Eigen/QR>
+#include "solverbase.h"
 
 template<typename MatrixType> void qr(const MatrixType& m)
 {
@@ -41,11 +42,7 @@ template<typename MatrixType, int Cols2> void qr_fixedsize()
 
   VERIFY_IS_APPROX(m1, qr.householderQ() * r);
 
-  Matrix<Scalar,Cols,Cols2> m2 = Matrix<Scalar,Cols,Cols2>::Random(Cols,Cols2);
-  Matrix<Scalar,Rows,Cols2> m3 = m1*m2;
-  m2 = Matrix<Scalar,Cols,Cols2>::Random(Cols,Cols2);
-  m2 = qr.solve(m3);
-  VERIFY_IS_APPROX(m3, m1*m2);
+  check_solverbase<Matrix<Scalar,Cols,Cols2>, Matrix<Scalar,Rows,Cols2> >(m1, qr, Rows, Cols, Cols2);
 }
 
 template<typename MatrixType> void qr_invertible()
@@ -57,6 +54,8 @@ template<typename MatrixType> void qr_invertible()
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
   typedef typename MatrixType::Scalar Scalar;
 
+  STATIC_CHECK(( internal::is_same<typename HouseholderQR<MatrixType>::StorageIndex,int>::value ));
+
   int size = internal::random<int>(10,50);
 
   MatrixType m1(size, size), m2(size, size), m3(size, size);
@@ -70,9 +69,8 @@ template<typename MatrixType> void qr_invertible()
   }
 
   HouseholderQR<MatrixType> qr(m1);
-  m3 = MatrixType::Random(size,size);
-  m2 = qr.solve(m3);
-  VERIFY_IS_APPROX(m3, m1*m2);
+
+  check_solverbase<MatrixType, MatrixType>(m1, qr, size, size, size);
 
   // now construct a matrix with prescribed determinant
   m1.setZero();
@@ -83,7 +81,7 @@ template<typename MatrixType> void qr_invertible()
   qr.compute(m1);
   VERIFY_IS_APPROX(log(absdet), qr.logAbsDeterminant());
   // This test is tricky if the determinant becomes too small.
-  // Since we generate random numbers with magnitude rrange [0,1], the average determinant is 0.5^size
+  // Since we generate random numbers with magnitude range [0,1], the average determinant is 0.5^size
   VERIFY_IS_MUCH_SMALLER_THAN( abs(absdet-qr.absDeterminant()), numext::maxi(RealScalar(pow(0.5,size)),numext::maxi<RealScalar>(abs(absdet),abs(qr.absDeterminant()))) );
   
 }
@@ -95,12 +93,14 @@ template<typename MatrixType> void qr_verify_assert()
   HouseholderQR<MatrixType> qr;
   VERIFY_RAISES_ASSERT(qr.matrixQR())
   VERIFY_RAISES_ASSERT(qr.solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp))
   VERIFY_RAISES_ASSERT(qr.householderQ())
   VERIFY_RAISES_ASSERT(qr.absDeterminant())
   VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
 }
 
-void test_qr()
+EIGEN_DECLARE_TEST(qr)
 {
   for(int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST_1( qr(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/contrib/eigen/test/qr_colpivoting.cpp b/contrib/eigen/test/qr_colpivoting.cpp
index 96c0badb7eff076067fac50228183d8acbb14b6e..06f16438f1c0f03a097801d98656dd501ef4e94a 100644
--- a/contrib/eigen/test/qr_colpivoting.cpp
+++ b/contrib/eigen/test/qr_colpivoting.cpp
@@ -11,9 +11,12 @@
 #include "main.h"
 #include <Eigen/QR>
 #include <Eigen/SVD>
+#include "solverbase.h"
 
 template <typename MatrixType>
 void cod() {
+  STATIC_CHECK(( internal::is_same<typename CompleteOrthogonalDecomposition<MatrixType>::StorageIndex,int>::value ));
+
   Index rows = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
   Index cols = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
   Index cols2 = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
@@ -46,12 +49,12 @@ void cod() {
   MatrixType c = q * t * z * cod.colsPermutation().inverse();
   VERIFY_IS_APPROX(matrix, c);
 
+  check_solverbase<MatrixType, MatrixType>(matrix, cod, rows, cols, cols2);
+
+  // Verify that we get the same minimum-norm solution as the SVD.
   MatrixType exact_solution = MatrixType::Random(cols, cols2);
   MatrixType rhs = matrix * exact_solution;
   MatrixType cod_solution = cod.solve(rhs);
-  VERIFY_IS_APPROX(rhs, matrix * cod_solution);
-
-  // Verify that we get the same minimum-norm solution as the SVD.
   JacobiSVD<MatrixType> svd(matrix, ComputeThinU | ComputeThinV);
   MatrixType svd_solution = svd.solve(rhs);
   VERIFY_IS_APPROX(cod_solution, svd_solution);
@@ -67,32 +70,38 @@ void cod_fixedsize() {
     Cols = MatrixType::ColsAtCompileTime
   };
   typedef typename MatrixType::Scalar Scalar;
+  typedef CompleteOrthogonalDecomposition<Matrix<Scalar, Rows, Cols> > COD;
   int rank = internal::random<int>(1, (std::min)(int(Rows), int(Cols)) - 1);
   Matrix<Scalar, Rows, Cols> matrix;
   createRandomPIMatrixOfRank(rank, Rows, Cols, matrix);
-  CompleteOrthogonalDecomposition<Matrix<Scalar, Rows, Cols> > cod(matrix);
+  COD cod(matrix);
   VERIFY(rank == cod.rank());
   VERIFY(Cols - cod.rank() == cod.dimensionOfKernel());
   VERIFY(cod.isInjective() == (rank == Rows));
   VERIFY(cod.isSurjective() == (rank == Cols));
   VERIFY(cod.isInvertible() == (cod.isInjective() && cod.isSurjective()));
 
+  check_solverbase<Matrix<Scalar, Cols, Cols2>, Matrix<Scalar, Rows, Cols2> >(matrix, cod, Rows, Cols, Cols2);
+
+  // Verify that we get the same minimum-norm solution as the SVD.
   Matrix<Scalar, Cols, Cols2> exact_solution;
   exact_solution.setRandom(Cols, Cols2);
   Matrix<Scalar, Rows, Cols2> rhs = matrix * exact_solution;
   Matrix<Scalar, Cols, Cols2> cod_solution = cod.solve(rhs);
-  VERIFY_IS_APPROX(rhs, matrix * cod_solution);
-
-  // Verify that we get the same minimum-norm solution as the SVD.
   JacobiSVD<MatrixType> svd(matrix, ComputeFullU | ComputeFullV);
   Matrix<Scalar, Cols, Cols2> svd_solution = svd.solve(rhs);
   VERIFY_IS_APPROX(cod_solution, svd_solution);
+
+  typename Inverse<COD>::PlainObject pinv = cod.pseudoInverse();
+  VERIFY_IS_APPROX(cod_solution, pinv * rhs);
 }
 
 template<typename MatrixType> void qr()
 {
   using std::sqrt;
 
+  STATIC_CHECK(( internal::is_same<typename ColPivHouseholderQR<MatrixType>::StorageIndex,int>::value ));
+
   Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols2 = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
   Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
 
@@ -133,13 +142,10 @@ template<typename MatrixType> void qr()
     VERIFY_IS_APPROX_OR_LESS_THAN(y, x);
   }
 
-  MatrixType m2 = MatrixType::Random(cols,cols2);
-  MatrixType m3 = m1*m2;
-  m2 = MatrixType::Random(cols,cols2);
-  m2 = qr.solve(m3);
-  VERIFY_IS_APPROX(m3, m1*m2);
+  check_solverbase<MatrixType, MatrixType>(m1, qr, rows, cols, cols2);
 
   {
+    MatrixType m2, m3;
     Index size = rows;
     do {
       m1 = MatrixType::Random(size,size);
@@ -173,11 +179,8 @@ template<typename MatrixType, int Cols2> void qr_fixedsize()
   Matrix<Scalar,Rows,Cols> c = qr.householderQ() * r * qr.colsPermutation().inverse();
   VERIFY_IS_APPROX(m1, c);
 
-  Matrix<Scalar,Cols,Cols2> m2 = Matrix<Scalar,Cols,Cols2>::Random(Cols,Cols2);
-  Matrix<Scalar,Rows,Cols2> m3 = m1*m2;
-  m2 = Matrix<Scalar,Cols,Cols2>::Random(Cols,Cols2);
-  m2 = qr.solve(m3);
-  VERIFY_IS_APPROX(m3, m1*m2);
+  check_solverbase<Matrix<Scalar,Cols,Cols2>, Matrix<Scalar,Rows,Cols2> >(m1, qr, Rows, Cols, Cols2);
+
   // Verify that the absolute value of the diagonal elements in R are
   // non-increasing until they reache the singularity threshold.
   RealScalar threshold =
@@ -264,9 +267,8 @@ template<typename MatrixType> void qr_invertible()
   }
 
   ColPivHouseholderQR<MatrixType> qr(m1);
-  m3 = MatrixType::Random(size,size);
-  m2 = qr.solve(m3);
-  //VERIFY_IS_APPROX(m3, m1*m2);
+
+  check_solverbase<MatrixType, MatrixType>(m1, qr, size, size, size);
 
   // now construct a matrix with prescribed determinant
   m1.setZero();
@@ -286,6 +288,8 @@ template<typename MatrixType> void qr_verify_assert()
   ColPivHouseholderQR<MatrixType> qr;
   VERIFY_RAISES_ASSERT(qr.matrixQR())
   VERIFY_RAISES_ASSERT(qr.solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp))
   VERIFY_RAISES_ASSERT(qr.householderQ())
   VERIFY_RAISES_ASSERT(qr.dimensionOfKernel())
   VERIFY_RAISES_ASSERT(qr.isInjective())
@@ -296,7 +300,26 @@ template<typename MatrixType> void qr_verify_assert()
   VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
 }
 
-void test_qr_colpivoting()
+template<typename MatrixType> void cod_verify_assert()
+{
+  MatrixType tmp;
+
+  CompleteOrthogonalDecomposition<MatrixType> cod;
+  VERIFY_RAISES_ASSERT(cod.matrixQTZ())
+  VERIFY_RAISES_ASSERT(cod.solve(tmp))
+  VERIFY_RAISES_ASSERT(cod.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(cod.adjoint().solve(tmp))
+  VERIFY_RAISES_ASSERT(cod.householderQ())
+  VERIFY_RAISES_ASSERT(cod.dimensionOfKernel())
+  VERIFY_RAISES_ASSERT(cod.isInjective())
+  VERIFY_RAISES_ASSERT(cod.isSurjective())
+  VERIFY_RAISES_ASSERT(cod.isInvertible())
+  VERIFY_RAISES_ASSERT(cod.pseudoInverse())
+  VERIFY_RAISES_ASSERT(cod.absDeterminant())
+  VERIFY_RAISES_ASSERT(cod.logAbsDeterminant())
+}
+
+EIGEN_DECLARE_TEST(qr_colpivoting)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( qr<MatrixXf>() );
@@ -330,6 +353,13 @@ void test_qr_colpivoting()
   CALL_SUBTEST_6(qr_verify_assert<MatrixXcf>());
   CALL_SUBTEST_3(qr_verify_assert<MatrixXcd>());
 
+  CALL_SUBTEST_7(cod_verify_assert<Matrix3f>());
+  CALL_SUBTEST_8(cod_verify_assert<Matrix3d>());
+  CALL_SUBTEST_1(cod_verify_assert<MatrixXf>());
+  CALL_SUBTEST_2(cod_verify_assert<MatrixXd>());
+  CALL_SUBTEST_6(cod_verify_assert<MatrixXcf>());
+  CALL_SUBTEST_3(cod_verify_assert<MatrixXcd>());
+
   // Test problem size constructors
   CALL_SUBTEST_9(ColPivHouseholderQR<MatrixXf>(10, 20));
 
diff --git a/contrib/eigen/test/qr_fullpivoting.cpp b/contrib/eigen/test/qr_fullpivoting.cpp
index 4d8ef686e895424f7dfab035a9a53a73dc4f15f3..f2d8cb33ecb1cc88a539c7703e157e969052508e 100644
--- a/contrib/eigen/test/qr_fullpivoting.cpp
+++ b/contrib/eigen/test/qr_fullpivoting.cpp
@@ -10,14 +10,18 @@
 
 #include "main.h"
 #include <Eigen/QR>
+#include "solverbase.h"
 
 template<typename MatrixType> void qr()
 {
+  STATIC_CHECK(( internal::is_same<typename FullPivHouseholderQR<MatrixType>::StorageIndex,int>::value ));
+
+  static const int Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime;
   Index max_size = EIGEN_TEST_MAX_SIZE;
   Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10);
-  Index rows  = internal::random<Index>(min_size,max_size),
-        cols  = internal::random<Index>(min_size,max_size),
-        cols2 = internal::random<Index>(min_size,max_size),
+  Index rows  = Rows == Dynamic ? internal::random<Index>(min_size,max_size) : Rows,
+        cols  = Cols == Dynamic ? internal::random<Index>(min_size,max_size) : Cols,
+        cols2 = Cols == Dynamic ? internal::random<Index>(min_size,max_size) : Cols,
         rank  = internal::random<Index>(1, (std::min)(rows, cols)-1);
 
   typedef typename MatrixType::Scalar Scalar;
@@ -47,13 +51,10 @@ template<typename MatrixType> void qr()
   MatrixType tmp;
   VERIFY_IS_APPROX(tmp.noalias() = qr.matrixQ() * r, (qr.matrixQ() * r).eval());
   
-  MatrixType m2 = MatrixType::Random(cols,cols2);
-  MatrixType m3 = m1*m2;
-  m2 = MatrixType::Random(cols,cols2);
-  m2 = qr.solve(m3);
-  VERIFY_IS_APPROX(m3, m1*m2);
+  check_solverbase<MatrixType, MatrixType>(m1, qr, rows, cols, cols2);
 
   {
+    MatrixType m2, m3;
     Index size = rows;
     do {
       m1 = MatrixType::Random(size,size);
@@ -92,9 +93,7 @@ template<typename MatrixType> void qr_invertible()
   VERIFY(qr.isInvertible());
   VERIFY(qr.isSurjective());
 
-  m3 = MatrixType::Random(size,size);
-  m2 = qr.solve(m3);
-  VERIFY_IS_APPROX(m3, m1*m2);
+  check_solverbase<MatrixType, MatrixType>(m1, qr, size, size, size);
 
   // now construct a matrix with prescribed determinant
   m1.setZero();
@@ -114,6 +113,8 @@ template<typename MatrixType> void qr_verify_assert()
   FullPivHouseholderQR<MatrixType> qr;
   VERIFY_RAISES_ASSERT(qr.matrixQR())
   VERIFY_RAISES_ASSERT(qr.solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.transpose().solve(tmp))
+  VERIFY_RAISES_ASSERT(qr.adjoint().solve(tmp))
   VERIFY_RAISES_ASSERT(qr.matrixQ())
   VERIFY_RAISES_ASSERT(qr.dimensionOfKernel())
   VERIFY_RAISES_ASSERT(qr.isInjective())
@@ -124,11 +125,12 @@ template<typename MatrixType> void qr_verify_assert()
   VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
 }
 
-void test_qr_fullpivoting()
+EIGEN_DECLARE_TEST(qr_fullpivoting)
 {
- for(int i = 0; i < 1; i++) {
-    // FIXME : very weird bug here
-//     CALL_SUBTEST(qr(Matrix2f()) );
+  for(int i = 0; i < 1; i++) {
+    CALL_SUBTEST_5( qr<Matrix3f>() );
+    CALL_SUBTEST_6( qr<Matrix3d>() );
+    CALL_SUBTEST_8( qr<Matrix2f>() );
     CALL_SUBTEST_1( qr<MatrixXf>() );
     CALL_SUBTEST_2( qr<MatrixXd>() );
     CALL_SUBTEST_3( qr<MatrixXcd>() );
diff --git a/contrib/eigen/test/qtvector.cpp b/contrib/eigen/test/qtvector.cpp
index 22df0d515711733a5e76ed2f4abf5bd42926d892..4ec79b1e609ee7f0d8bffa7d3f9834ef73cf1e36 100644
--- a/contrib/eigen/test/qtvector.cpp
+++ b/contrib/eigen/test/qtvector.cpp
@@ -125,7 +125,7 @@ void check_qtvector_quaternion(const QuaternionType&)
   }
 }
 
-void test_qtvector()
+EIGEN_DECLARE_TEST(qtvector)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST(check_qtvector_matrix(Vector2f()));
diff --git a/contrib/eigen/test/rand.cpp b/contrib/eigen/test/rand.cpp
index 51cf01773f55b9e8350dbcfc0f588a63c0bca65a..984c01f53cfa3c4f66d123a34026e40f20dc01c6 100644
--- a/contrib/eigen/test/rand.cpp
+++ b/contrib/eigen/test/rand.cpp
@@ -51,10 +51,10 @@ template<typename Scalar> void check_histogram(Scalar x, Scalar y, int bins)
     Scalar r = check_in_range(x,y);
     hist( int((int64(r)-int64(x))/divisor) )++;
   }
-  VERIFY( (((hist.cast<double>()/double(f))-1.0).abs()<0.02).all() );
+  VERIFY( (((hist.cast<double>()/double(f))-1.0).abs()<0.03).all() );
 }
 
-void test_rand()
+EIGEN_DECLARE_TEST(rand)
 {
   long long_ref = NumTraits<long>::highest()/10;
   signed char char_offset = (std::min)(g_repeat,64);
diff --git a/contrib/eigen/test/random_without_cast_overflow.h b/contrib/eigen/test/random_without_cast_overflow.h
new file mode 100644
index 0000000000000000000000000000000000000000..0003451108b24f4bbcec6bd9eda4d2cd1cd13423
--- /dev/null
+++ b/contrib/eigen/test/random_without_cast_overflow.h
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 C. Antonio Sanchez <cantonios@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Utilities for generating random numbers without overflows, which might
+// otherwise result in undefined behavior.
+
+namespace Eigen {
+namespace internal {
+
+// Default implementation assuming SrcScalar fits into TgtScalar.
+template <typename SrcScalar, typename TgtScalar, typename EnableIf = void>
+struct random_without_cast_overflow {
+  static SrcScalar value() { return internal::random<SrcScalar>(); }
+};
+
+// Signed to unsigned integer widening cast.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsInteger && NumTraits<TgtScalar>::IsInteger &&
+                                 !NumTraits<TgtScalar>::IsSigned &&
+                                 (std::numeric_limits<SrcScalar>::digits < std::numeric_limits<TgtScalar>::digits ||
+                                  (std::numeric_limits<SrcScalar>::digits == std::numeric_limits<TgtScalar>::digits &&
+                                   NumTraits<SrcScalar>::IsSigned))>::type> {
+  static SrcScalar value() {
+    SrcScalar a = internal::random<SrcScalar>();
+    return a < SrcScalar(0) ? -(a + 1) : a;
+  }
+};
+
+// Integer to unsigned narrowing cast.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<
+        NumTraits<SrcScalar>::IsInteger && NumTraits<TgtScalar>::IsInteger && !NumTraits<SrcScalar>::IsSigned &&
+        (std::numeric_limits<SrcScalar>::digits > std::numeric_limits<TgtScalar>::digits)>::type> {
+  static SrcScalar value() {
+    TgtScalar b = internal::random<TgtScalar>();
+    return static_cast<SrcScalar>(b < TgtScalar(0) ? -(b + 1) : b);
+  }
+};
+
+// Integer to signed narrowing cast.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<
+        NumTraits<SrcScalar>::IsInteger && NumTraits<TgtScalar>::IsInteger && NumTraits<SrcScalar>::IsSigned &&
+        (std::numeric_limits<SrcScalar>::digits > std::numeric_limits<TgtScalar>::digits)>::type> {
+  static SrcScalar value() { return static_cast<SrcScalar>(internal::random<TgtScalar>()); }
+};
+
+// Unsigned to signed integer narrowing cast.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsInteger && NumTraits<TgtScalar>::IsInteger &&
+                                 !NumTraits<SrcScalar>::IsSigned && NumTraits<TgtScalar>::IsSigned &&
+                                 (std::numeric_limits<SrcScalar>::digits ==
+                                  std::numeric_limits<TgtScalar>::digits)>::type> {
+  static SrcScalar value() { return internal::random<SrcScalar>() / 2; }
+};
+
+// Floating-point to integer, full precision.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<
+        !NumTraits<SrcScalar>::IsInteger && !NumTraits<SrcScalar>::IsComplex && NumTraits<TgtScalar>::IsInteger &&
+        (std::numeric_limits<TgtScalar>::digits <= std::numeric_limits<SrcScalar>::digits)>::type> {
+  static SrcScalar value() { return static_cast<SrcScalar>(internal::random<TgtScalar>()); }
+};
+
+// Floating-point to integer, narrowing precision.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<
+        !NumTraits<SrcScalar>::IsInteger && !NumTraits<SrcScalar>::IsComplex && NumTraits<TgtScalar>::IsInteger &&
+        (std::numeric_limits<TgtScalar>::digits > std::numeric_limits<SrcScalar>::digits)>::type> {
+  static SrcScalar value() {
+    // NOTE: internal::random<T>() is limited by RAND_MAX, so random<int64_t> is always within that range.
+    // This prevents us from simply shifting bits, which would result in only 0 or -1.
+    // Instead, keep least-significant K bits and sign.
+    static const TgtScalar KeepMask = (static_cast<TgtScalar>(1) << std::numeric_limits<SrcScalar>::digits) - 1;
+    const TgtScalar a = internal::random<TgtScalar>();
+    return static_cast<SrcScalar>(a > TgtScalar(0) ? (a & KeepMask) : -(a & KeepMask));
+  }
+};
+
+// Integer to floating-point, re-use above logic.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsInteger && !NumTraits<TgtScalar>::IsInteger &&
+                                 !NumTraits<TgtScalar>::IsComplex>::type> {
+  static SrcScalar value() {
+    return static_cast<SrcScalar>(random_without_cast_overflow<TgtScalar, SrcScalar>::value());
+  }
+};
+
+// Floating-point narrowing conversion.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<!NumTraits<SrcScalar>::IsInteger && !NumTraits<SrcScalar>::IsComplex &&
+                                 !NumTraits<TgtScalar>::IsInteger && !NumTraits<TgtScalar>::IsComplex &&
+                                 (std::numeric_limits<SrcScalar>::digits >
+                                  std::numeric_limits<TgtScalar>::digits)>::type> {
+  static SrcScalar value() { return static_cast<SrcScalar>(internal::random<TgtScalar>()); }
+};
+
+// Complex to non-complex.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsComplex && !NumTraits<TgtScalar>::IsComplex>::type> {
+  typedef typename NumTraits<SrcScalar>::Real SrcReal;
+  static SrcScalar value() { return SrcScalar(random_without_cast_overflow<SrcReal, TgtScalar>::value(), 0); }
+};
+
+// Non-complex to complex.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<!NumTraits<SrcScalar>::IsComplex && NumTraits<TgtScalar>::IsComplex>::type> {
+  typedef typename NumTraits<TgtScalar>::Real TgtReal;
+  static SrcScalar value() { return random_without_cast_overflow<SrcScalar, TgtReal>::value(); }
+};
+
+// Complex to complex.
+template <typename SrcScalar, typename TgtScalar>
+struct random_without_cast_overflow<
+    SrcScalar, TgtScalar,
+    typename internal::enable_if<NumTraits<SrcScalar>::IsComplex && NumTraits<TgtScalar>::IsComplex>::type> {
+  typedef typename NumTraits<SrcScalar>::Real SrcReal;
+  typedef typename NumTraits<TgtScalar>::Real TgtReal;
+  static SrcScalar value() {
+    return SrcScalar(random_without_cast_overflow<SrcReal, TgtReal>::value(),
+                     random_without_cast_overflow<SrcReal, TgtReal>::value());
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
diff --git a/contrib/eigen/test/real_qz.cpp b/contrib/eigen/test/real_qz.cpp
index 3c1492e4bb37d6acbcbbdccb7b7b3a5997bec8e3..1cf7aba2d3d0106d5f9ccb4c5f6332b21356a068 100644
--- a/contrib/eigen/test/real_qz.cpp
+++ b/contrib/eigen/test/real_qz.cpp
@@ -75,7 +75,7 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
   VERIFY_IS_APPROX(qz.matrixZ()*qz.matrixZ().adjoint(), MatrixType::Identity(dim,dim));
 }
 
-void test_real_qz()
+EIGEN_DECLARE_TEST(real_qz)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
diff --git a/contrib/eigen/test/redux.cpp b/contrib/eigen/test/redux.cpp
index 213f080aa48ad535f468cd1892ecab1655f78adb..fdbab771473fab5afa61dcdb7ae334164a6f2eec 100644
--- a/contrib/eigen/test/redux.cpp
+++ b/contrib/eigen/test/redux.cpp
@@ -28,6 +28,9 @@ template<typename MatrixType> void matrixRedux(const MatrixType& m)
   // failures if we underflow into denormals. Thus, we scale so that entries are close to 1.
   MatrixType m1_for_prod = MatrixType::Ones(rows, cols) + RealScalar(0.2) * m1;
 
+  Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> m2(rows,rows);
+  m2.setRandom();
+
   VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1));
   VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).sum(), Scalar(float(rows*cols))); // the float() here to shut up excessive MSVC warning about int->complex conversion being lossy
   Scalar s(0), p(1), minc(numext::real(m1.coeff(0))), maxc(numext::real(m1.coeff(0)));
@@ -46,6 +49,10 @@ template<typename MatrixType> void matrixRedux(const MatrixType& m)
   VERIFY_IS_APPROX(m1_for_prod.prod(), p);
   VERIFY_IS_APPROX(m1.real().minCoeff(), numext::real(minc));
   VERIFY_IS_APPROX(m1.real().maxCoeff(), numext::real(maxc));
+  
+  // test that partial reduction works if nested expressions is forced to evaluate early
+  VERIFY_IS_APPROX((m1.matrix() * m1.matrix().transpose())       .cwiseProduct(m2.matrix()).rowwise().sum().sum(), 
+                   (m1.matrix() * m1.matrix().transpose()).eval().cwiseProduct(m2.matrix()).rowwise().sum().sum());
 
   // test slice vectorization assuming assign is ok
   Index r0 = internal::random<Index>(0,rows-1);
@@ -72,8 +79,6 @@ template<typename MatrixType> void matrixRedux(const MatrixType& m)
 
   // test nesting complex expression
   VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1) );
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> m2(rows,rows);
-  m2.setRandom();
   VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(),(MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1));
 }
 
@@ -146,7 +151,7 @@ template<typename VectorType> void vectorRedux(const VectorType& w)
   VERIFY_RAISES_ASSERT(v.head(0).maxCoeff());
 }
 
-void test_redux()
+EIGEN_DECLARE_TEST(redux)
 {
   // the max size cannot be too large, otherwise reduxion operations obviously generate large errors.
   int maxsize = (std::min)(100,EIGEN_TEST_MAX_SIZE);
diff --git a/contrib/eigen/test/ref.cpp b/contrib/eigen/test/ref.cpp
index da399e287f1dd25258a34034ae9ade5a14871994..ebfc70d3d1ce4e4280df37262c9a8901332fd4bc 100644
--- a/contrib/eigen/test/ref.cpp
+++ b/contrib/eigen/test/ref.cpp
@@ -141,6 +141,69 @@ template<typename VectorType> void ref_vector(const VectorType& m)
   VERIFY_IS_APPROX(mat1, mat2);
 }
 
+template<typename Scalar, int Rows, int Cols>
+void ref_vector_fixed_sizes()
+{
+  typedef Matrix<Scalar,Rows,Cols,RowMajor> RowMajorMatrixType;
+  typedef Matrix<Scalar,Rows,Cols,ColMajor> ColMajorMatrixType;
+  typedef Matrix<Scalar,1,Cols> RowVectorType;
+  typedef Matrix<Scalar,Rows,1> ColVectorType;
+  typedef Matrix<Scalar,Cols,1> RowVectorTransposeType;
+  typedef Matrix<Scalar,1,Rows> ColVectorTransposeType;
+  typedef Stride<Dynamic, Dynamic> DynamicStride;
+
+  RowMajorMatrixType mr = RowMajorMatrixType::Random();
+  ColMajorMatrixType mc = ColMajorMatrixType::Random();
+
+  Index i = internal::random<Index>(0,Rows-1);
+  Index j = internal::random<Index>(0,Cols-1);
+
+  // Reference ith row.
+  Ref<RowVectorType, 0, DynamicStride> mr_ri = mr.row(i);
+  VERIFY_IS_EQUAL(mr_ri, mr.row(i));
+  Ref<RowVectorType, 0, DynamicStride> mc_ri = mc.row(i);
+  VERIFY_IS_EQUAL(mc_ri, mc.row(i));
+
+  // Reference jth col.
+  Ref<ColVectorType, 0, DynamicStride> mr_cj = mr.col(j);
+  VERIFY_IS_EQUAL(mr_cj, mr.col(j));
+  Ref<ColVectorType, 0, DynamicStride> mc_cj = mc.col(j);
+  VERIFY_IS_EQUAL(mc_cj, mc.col(j));
+
+  // Reference the transpose of row i.
+  Ref<RowVectorTransposeType, 0, DynamicStride> mr_rit = mr.row(i);
+  VERIFY_IS_EQUAL(mr_rit, mr.row(i).transpose());
+  Ref<RowVectorTransposeType, 0, DynamicStride> mc_rit = mc.row(i);
+  VERIFY_IS_EQUAL(mc_rit, mc.row(i).transpose());
+
+  // Reference the transpose of col j.
+  Ref<ColVectorTransposeType, 0, DynamicStride> mr_cjt = mr.col(j);
+  VERIFY_IS_EQUAL(mr_cjt, mr.col(j).transpose());
+  Ref<ColVectorTransposeType, 0, DynamicStride> mc_cjt = mc.col(j);
+  VERIFY_IS_EQUAL(mc_cjt, mc.col(j).transpose());
+  
+  // Const references without strides.
+  Ref<const RowVectorType> cmr_ri = mr.row(i);
+  VERIFY_IS_EQUAL(cmr_ri, mr.row(i));
+  Ref<const RowVectorType> cmc_ri = mc.row(i);
+  VERIFY_IS_EQUAL(cmc_ri, mc.row(i));
+
+  Ref<const ColVectorType> cmr_cj = mr.col(j);
+  VERIFY_IS_EQUAL(cmr_cj, mr.col(j));
+  Ref<const ColVectorType> cmc_cj = mc.col(j);
+  VERIFY_IS_EQUAL(cmc_cj, mc.col(j));
+
+  Ref<const RowVectorTransposeType> cmr_rit = mr.row(i);
+  VERIFY_IS_EQUAL(cmr_rit, mr.row(i).transpose());
+  Ref<const RowVectorTransposeType> cmc_rit = mc.row(i);
+  VERIFY_IS_EQUAL(cmc_rit, mc.row(i).transpose());
+
+  Ref<const ColVectorTransposeType> cmr_cjt = mr.col(j);
+  VERIFY_IS_EQUAL(cmr_cjt, mr.col(j).transpose());
+  Ref<const ColVectorTransposeType> cmc_cjt = mc.col(j);
+  VERIFY_IS_EQUAL(cmc_cjt, mc.col(j).transpose());
+}
+
 template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
 {
   // verify that ref-to-const don't have LvalueBit
@@ -259,8 +322,8 @@ void test_ref_overloads()
 
 void test_ref_fixed_size_assert()
 {
-  Vector4f v4;
-  VectorXf vx(10);
+  Vector4f v4 = Vector4f::Random();
+  VectorXf vx = VectorXf::Random(10);
   VERIFY_RAISES_STATIC_ASSERT( Ref<Vector3f> y = v4; (void)y; );
   VERIFY_RAISES_STATIC_ASSERT( Ref<Vector3f> y = vx.head<4>(); (void)y; );
   VERIFY_RAISES_STATIC_ASSERT( Ref<const Vector3f> y = v4; (void)y; );
@@ -268,7 +331,7 @@ void test_ref_fixed_size_assert()
   VERIFY_RAISES_STATIC_ASSERT( Ref<const Vector3f> y = 2*v4; (void)y; );
 }
 
-void test_ref()
+EIGEN_DECLARE_TEST(ref)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( ref_vector(Matrix<float, 1, 1>()) );
@@ -287,6 +350,9 @@ void test_ref()
     CALL_SUBTEST_4( ref_matrix(Matrix<std::complex<double>,10,15>()) );
     CALL_SUBTEST_5( ref_matrix(MatrixXi(internal::random<int>(1,10),internal::random<int>(1,10))) );
     CALL_SUBTEST_6( call_ref() );
+
+    CALL_SUBTEST_8( (ref_vector_fixed_sizes<float,3,5>()) );
+    CALL_SUBTEST_8( (ref_vector_fixed_sizes<float,15,10>()) );
   }
   
   CALL_SUBTEST_7( test_ref_overloads() );
diff --git a/contrib/eigen/test/reshape.cpp b/contrib/eigen/test/reshape.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b16742a23faeaafadad3fc95dec7a9918c0db7b
--- /dev/null
+++ b/contrib/eigen/test/reshape.cpp
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014 yoco <peter.xiau@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+template<typename T1,typename T2>
+typename internal::enable_if<internal::is_same<T1,T2>::value,bool>::type
+is_same_eq(const T1& a, const T2& b)
+{
+  return (a.array() == b.array()).all();
+}
+
+template <int Order,typename MatType>
+void check_auto_reshape4x4(MatType m)
+{
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 1>  v1( 1);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 2>  v2( 2);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 4>  v4( 4);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 8>  v8( 8);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1:16> v16(16);
+
+  VERIFY(is_same_eq(m.template reshaped<Order>( 1,       AutoSize), m.template reshaped<Order>( 1, 16)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 16      ), m.template reshaped<Order>( 1, 16)));
+  VERIFY(is_same_eq(m.template reshaped<Order>( 2,       AutoSize), m.template reshaped<Order>( 2,  8)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 8       ), m.template reshaped<Order>( 2,  8)));
+  VERIFY(is_same_eq(m.template reshaped<Order>( 4,       AutoSize), m.template reshaped<Order>( 4,  4)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 4       ), m.template reshaped<Order>( 4,  4)));
+  VERIFY(is_same_eq(m.template reshaped<Order>( 8,       AutoSize), m.template reshaped<Order>( 8,  2)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 2       ), m.template reshaped<Order>( 8,  2)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(16,       AutoSize), m.template reshaped<Order>(16,  1)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize, 1       ), m.template reshaped<Order>(16,  1)));
+
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix< 1>,   AutoSize),  m.template reshaped<Order>(fix< 1>, v16    )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix<16> ),  m.template reshaped<Order>( v1,     fix<16>)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix< 2>,   AutoSize),  m.template reshaped<Order>(fix< 2>, v8     )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix< 8> ),  m.template reshaped<Order>( v2,     fix< 8>)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix< 4>,   AutoSize),  m.template reshaped<Order>(fix< 4>, v4     )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix< 4> ),  m.template reshaped<Order>( v4,     fix< 4>)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix< 8>,   AutoSize),  m.template reshaped<Order>(fix< 8>, v2     )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix< 2> ),  m.template reshaped<Order>( v8,     fix< 2>)));
+  VERIFY(is_same_eq(m.template reshaped<Order>(fix<16>,   AutoSize),  m.template reshaped<Order>(fix<16>, v1     )));
+  VERIFY(is_same_eq(m.template reshaped<Order>(AutoSize,  fix< 1> ),  m.template reshaped<Order>(v16,     fix< 1>)));
+}
+
+template <typename MatType>
+void check_direct_access_reshape4x4(MatType , internal::FixedInt<RowMajorBit>) {}
+
+template <typename MatType>
+void check_direct_access_reshape4x4(MatType m, internal::FixedInt<0>) {
+  VERIFY_IS_EQUAL(m.reshaped( 1, 16).data(), m.data());
+  VERIFY_IS_EQUAL(m.reshaped( 1, 16).innerStride(), 1);
+
+  VERIFY_IS_EQUAL(m.reshaped( 2, 8).data(), m.data());
+  VERIFY_IS_EQUAL(m.reshaped( 2, 8).innerStride(), 1);
+  VERIFY_IS_EQUAL(m.reshaped( 2, 8).outerStride(), 2);
+}
+
+// just test a 4x4 matrix, enumerate all combination manually
+template <typename MatType>
+void reshape4x4(MatType m)
+{
+  typedef typename MatType::Scalar Scalar;
+
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 1>  v1( 1);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 2>  v2( 2);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 4>  v4( 4);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1: 8>  v8( 8);
+  internal::VariableAndFixedInt<MatType::SizeAtCompileTime==Dynamic?-1:16> v16(16);
+
+  if((MatType::Flags&RowMajorBit)==0)
+  {
+    typedef Map<MatrixXi> MapMat;
+    // dynamic
+    VERIFY_IS_EQUAL((m.reshaped( 1, 16)), MapMat(m.data(),  1, 16));
+    VERIFY_IS_EQUAL((m.reshaped( 2,  8)), MapMat(m.data(),  2,  8));
+    VERIFY_IS_EQUAL((m.reshaped( 4,  4)), MapMat(m.data(),  4,  4));
+    VERIFY_IS_EQUAL((m.reshaped( 8,  2)), MapMat(m.data(),  8,  2));
+    VERIFY_IS_EQUAL((m.reshaped(16,  1)), MapMat(m.data(), 16,  1));
+
+    // static
+    VERIFY_IS_EQUAL(m.reshaped(fix< 1>, fix<16>), MapMat(m.data(),  1, 16));
+    VERIFY_IS_EQUAL(m.reshaped(fix< 2>, fix< 8>), MapMat(m.data(),  2,  8));
+    VERIFY_IS_EQUAL(m.reshaped(fix< 4>, fix< 4>), MapMat(m.data(),  4,  4));
+    VERIFY_IS_EQUAL(m.reshaped(fix< 8>, fix< 2>), MapMat(m.data(),  8,  2));
+    VERIFY_IS_EQUAL(m.reshaped(fix<16>, fix< 1>), MapMat(m.data(), 16,  1));
+
+
+    // reshape chain
+    VERIFY_IS_EQUAL(
+      (m
+      .reshaped( 1, 16)
+      .reshaped(fix< 2>,fix< 8>)
+      .reshaped(16,  1)
+      .reshaped(fix< 8>,fix< 2>)
+      .reshaped( 2,  8)
+      .reshaped(fix< 1>,fix<16>)
+      .reshaped( 4,  4)
+      .reshaped(fix<16>,fix< 1>)
+      .reshaped( 8,  2)
+      .reshaped(fix< 4>,fix< 4>)
+      ),
+      MapMat(m.data(), 4,  4)
+    );
+  }
+
+  VERIFY(is_same_eq(m.reshaped( 1,       AutoSize), m.reshaped( 1, 16)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize, 16),       m.reshaped( 1, 16)));
+  VERIFY(is_same_eq(m.reshaped( 2,       AutoSize), m.reshaped( 2,  8)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize, 8),        m.reshaped( 2,  8)));
+  VERIFY(is_same_eq(m.reshaped( 4,       AutoSize), m.reshaped( 4,  4)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize, 4),        m.reshaped( 4,  4)));
+  VERIFY(is_same_eq(m.reshaped( 8,       AutoSize), m.reshaped( 8,  2)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize, 2),        m.reshaped( 8,  2)));
+  VERIFY(is_same_eq(m.reshaped(16,       AutoSize), m.reshaped(16,  1)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  1),       m.reshaped(16,  1)));
+
+  VERIFY(is_same_eq(m.reshaped(fix< 1>,   AutoSize),  m.reshaped(fix< 1>, v16)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix<16>),   m.reshaped( v1,     fix<16>)));
+  VERIFY(is_same_eq(m.reshaped(fix< 2>,   AutoSize),  m.reshaped(fix< 2>, v8)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix< 8>),   m.reshaped( v2,     fix< 8>)));
+  VERIFY(is_same_eq(m.reshaped(fix< 4>,   AutoSize),  m.reshaped(fix< 4>, v4)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix< 4>),   m.reshaped( v4,     fix< 4>)));
+  VERIFY(is_same_eq(m.reshaped(fix< 8>,   AutoSize),  m.reshaped(fix< 8>, v2)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix< 2>),   m.reshaped( v8,     fix< 2>)));
+  VERIFY(is_same_eq(m.reshaped(fix<16>,   AutoSize),  m.reshaped(fix<16>, v1)));
+  VERIFY(is_same_eq(m.reshaped(AutoSize,  fix< 1>),   m.reshaped(v16,     fix< 1>)));
+
+  check_auto_reshape4x4<ColMajor> (m);
+  check_auto_reshape4x4<RowMajor> (m);
+  check_auto_reshape4x4<AutoOrder>(m);
+  check_auto_reshape4x4<ColMajor> (m.transpose());
+  check_auto_reshape4x4<ColMajor> (m.transpose());
+  check_auto_reshape4x4<AutoOrder>(m.transpose());
+
+  check_direct_access_reshape4x4(m,fix<MatType::Flags&RowMajorBit>);
+
+  if((MatType::Flags&RowMajorBit)==0)
+  {
+    VERIFY_IS_EQUAL(m.template reshaped<ColMajor>(2,8),m.reshaped(2,8));
+    VERIFY_IS_EQUAL(m.template reshaped<ColMajor>(2,8),m.template reshaped<AutoOrder>(2,8));
+    VERIFY_IS_EQUAL(m.transpose().template reshaped<RowMajor>(2,8),m.transpose().template reshaped<AutoOrder>(2,8));
+  }
+  else
+  {
+    VERIFY_IS_EQUAL(m.template reshaped<ColMajor>(2,8),m.reshaped(2,8));
+    VERIFY_IS_EQUAL(m.template reshaped<RowMajor>(2,8),m.template reshaped<AutoOrder>(2,8));
+    VERIFY_IS_EQUAL(m.transpose().template reshaped<ColMajor>(2,8),m.transpose().template reshaped<AutoOrder>(2,8));
+    VERIFY_IS_EQUAL(m.transpose().reshaped(2,8),m.transpose().template reshaped<AutoOrder>(2,8));
+  }
+
+  MatrixXi m28r1 = m.template reshaped<RowMajor>(2,8);
+  MatrixXi m28r2 = m.transpose().template reshaped<ColMajor>(8,2).transpose();
+  VERIFY_IS_EQUAL( m28r1, m28r2);
+
+  VERIFY(is_same_eq(m.reshaped(v16,fix<1>), m.reshaped()));
+  VERIFY_IS_EQUAL(m.reshaped(16,1).eval(), m.reshaped().eval());
+  VERIFY_IS_EQUAL(m.reshaped(1,16).eval(), m.reshaped().transpose().eval());
+  VERIFY_IS_EQUAL(m.reshaped().reshaped(2,8), m.reshaped(2,8));
+  VERIFY_IS_EQUAL(m.reshaped().reshaped(4,4), m.reshaped(4,4));
+  VERIFY_IS_EQUAL(m.reshaped().reshaped(8,2), m.reshaped(8,2));
+
+  VERIFY_IS_EQUAL(m.reshaped(), m.template reshaped<ColMajor>());
+  VERIFY_IS_EQUAL(m.transpose().reshaped(), m.template reshaped<RowMajor>());
+  VERIFY_IS_EQUAL(m.template reshaped<RowMajor>(AutoSize,fix<1>), m.template reshaped<RowMajor>());
+  VERIFY_IS_EQUAL(m.template reshaped<AutoOrder>(AutoSize,fix<1>), m.template reshaped<AutoOrder>());
+
+  VERIFY(is_same_eq(m.reshaped(AutoSize,fix<1>), m.reshaped()));
+  VERIFY_IS_EQUAL(m.template reshaped<RowMajor>(fix<1>,AutoSize), m.transpose().reshaped().transpose());
+
+  // check assignment
+  {
+    Matrix<Scalar,Dynamic,1> m1x(m.size()); m1x.setRandom();
+    VERIFY_IS_APPROX(m.reshaped() = m1x, m1x);
+    VERIFY_IS_APPROX(m, m1x.reshaped(4,4));
+    
+    Matrix<Scalar,Dynamic,Dynamic> m28(2,8); m28.setRandom();
+    VERIFY_IS_APPROX(m.reshaped(2,8) = m28, m28);
+    VERIFY_IS_APPROX(m, m28.reshaped(4,4));
+    VERIFY_IS_APPROX(m.template reshaped<RowMajor>(2,8) = m28, m28);
+
+    Matrix<Scalar,Dynamic,Dynamic> m24(2,4); m24.setRandom();
+    VERIFY_IS_APPROX(m(seq(0,last,2),all).reshaped(2,4) = m24, m24);
+
+    // check constness:
+    m.reshaped(2,8).nestedExpression() = m;
+  }
+}
+
+EIGEN_DECLARE_TEST(reshape)
+{
+  typedef Matrix<int,Dynamic,Dynamic,RowMajor> RowMatrixXi;
+  typedef Matrix<int,4,4,RowMajor> RowMatrix4i;
+  MatrixXi mx = MatrixXi::Random(4, 4);
+  Matrix4i m4 = Matrix4i::Random(4, 4);
+  RowMatrixXi rmx = RowMatrixXi::Random(4, 4);
+  RowMatrix4i rm4 = RowMatrix4i::Random(4, 4);
+
+  // test dynamic-size matrix
+  CALL_SUBTEST(reshape4x4(mx));
+  // test static-size matrix
+  CALL_SUBTEST(reshape4x4(m4));
+  // test dynamic-size const matrix
+  CALL_SUBTEST(reshape4x4(static_cast<const MatrixXi>(mx)));
+  // test static-size const matrix
+  CALL_SUBTEST(reshape4x4(static_cast<const Matrix4i>(m4)));
+
+  CALL_SUBTEST(reshape4x4(rmx));
+  CALL_SUBTEST(reshape4x4(rm4));
+}
diff --git a/contrib/eigen/test/resize.cpp b/contrib/eigen/test/resize.cpp
index 4adaafe5695a982d500e35f3992a63008c806ae3..646a75b8f126eed916fb9b8f62bce424e4b7e252 100644
--- a/contrib/eigen/test/resize.cpp
+++ b/contrib/eigen/test/resize.cpp
@@ -33,7 +33,7 @@ void resizeLikeTest12() { resizeLikeTest<1,2>(); }
 void resizeLikeTest1020() { resizeLikeTest<10,20>(); }
 void resizeLikeTest31() { resizeLikeTest<3,1>(); }
 
-void test_resize()
+EIGEN_DECLARE_TEST(resize)
 {
   CALL_SUBTEST(resizeLikeTest12() );
   CALL_SUBTEST(resizeLikeTest1020() );
diff --git a/contrib/eigen/test/rvalue_types.cpp b/contrib/eigen/test/rvalue_types.cpp
index 8887f1b1b99072375855f2890429a4f516f0087c..2c9999ce80851e46579b405669bf8d912a77c6bf 100644
--- a/contrib/eigen/test/rvalue_types.cpp
+++ b/contrib/eigen/test/rvalue_types.cpp
@@ -7,7 +7,13 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#define EIGEN_RUNTIME_NO_MALLOC
+
 #include "main.h"
+#if EIGEN_HAS_CXX11
+#include "MovableScalar.h"
+#endif
+#include "SafeScalar.h"
 
 #include <Eigen/Core>
 
@@ -24,41 +30,128 @@ void rvalue_copyassign(const MatrixType& m)
   MatrixType tmp = m;
   UIntPtr src_address = reinterpret_cast<UIntPtr>(tmp.data());
   
+  Eigen::internal::set_is_malloc_allowed(false); // moving from an rvalue reference shall never allocate
   // move the temporary to n
   MatrixType n = std::move(tmp);
   UIntPtr dst_address = reinterpret_cast<UIntPtr>(n.data());
-
   if (MatrixType::RowsAtCompileTime==Dynamic|| MatrixType::ColsAtCompileTime==Dynamic)
   {
     // verify that we actually moved the guts
     VERIFY_IS_EQUAL(src_address, dst_address);
+    VERIFY_IS_EQUAL(tmp.size(), 0);
+    VERIFY_IS_EQUAL(reinterpret_cast<UIntPtr>(tmp.data()), UIntPtr(0));
   }
 
   // verify that the content did not change
   Scalar abs_diff = (m-n).array().abs().sum();
   VERIFY_IS_EQUAL(abs_diff, Scalar(0));
+  Eigen::internal::set_is_malloc_allowed(true);
+}
+template<typename TranspositionsType>
+void rvalue_transpositions(Index rows)
+{
+  typedef typename TranspositionsType::IndicesType PermutationVectorType;
+
+  PermutationVectorType vec;
+  randomPermutationVector(vec, rows);
+  TranspositionsType t0(vec);
+
+  Eigen::internal::set_is_malloc_allowed(false); // moving from an rvalue reference shall never allocate
+
+  UIntPtr t0_address = reinterpret_cast<UIntPtr>(t0.indices().data());
+
+  // Move constructors:
+  TranspositionsType t1 = std::move(t0);
+  UIntPtr t1_address = reinterpret_cast<UIntPtr>(t1.indices().data());
+  VERIFY_IS_EQUAL(t0_address, t1_address);
+  // t0 must be de-allocated:
+  VERIFY_IS_EQUAL(t0.size(), 0);
+  VERIFY_IS_EQUAL(reinterpret_cast<UIntPtr>(t0.indices().data()), UIntPtr(0));
+
+
+  // Move assignment:
+  t0 = std::move(t1);
+  t0_address = reinterpret_cast<UIntPtr>(t0.indices().data());
+  VERIFY_IS_EQUAL(t0_address, t1_address);
+  // t1 must be de-allocated:
+  VERIFY_IS_EQUAL(t1.size(), 0);
+  VERIFY_IS_EQUAL(reinterpret_cast<UIntPtr>(t1.indices().data()), UIntPtr(0));
+
+  Eigen::internal::set_is_malloc_allowed(true);
+}
+
+template <typename MatrixType>
+void rvalue_move(const MatrixType& m)
+{
+    // lvalue reference is copied
+    MatrixType b(m);
+    VERIFY_IS_EQUAL(b, m);
+
+    // lvalue reference is copied
+    MatrixType c{m};
+    VERIFY_IS_EQUAL(c, m);
+
+    // lvalue reference is copied
+    MatrixType d = m;
+    VERIFY_IS_EQUAL(d, m);
+
+    // rvalue reference is moved - copy constructor.
+    MatrixType e_src(m);
+    VERIFY_IS_EQUAL(e_src, m);
+    MatrixType e_dst(std::move(e_src));
+    VERIFY_IS_EQUAL(e_dst, m);
+
+    // rvalue reference is moved - copy constructor.
+    MatrixType f_src(m);
+    VERIFY_IS_EQUAL(f_src, m);
+    MatrixType f_dst = std::move(f_src);
+    VERIFY_IS_EQUAL(f_dst, m);
+    
+    // rvalue reference is moved - copy assignment.
+    MatrixType g_src(m);
+    VERIFY_IS_EQUAL(g_src, m);
+    MatrixType g_dst;
+    g_dst = std::move(g_src);
+    VERIFY_IS_EQUAL(g_dst, m);
 }
 #else
 template <typename MatrixType>
 void rvalue_copyassign(const MatrixType&) {}
+template<typename TranspositionsType>
+void rvalue_transpositions(Index) {}
+template <typename MatrixType>
+void rvalue_move(const MatrixType&) {}
 #endif
 
-void test_rvalue_types()
+EIGEN_DECLARE_TEST(rvalue_types)
 {
-  CALL_SUBTEST_1(rvalue_copyassign( MatrixXf::Random(50,50).eval() ));
-  CALL_SUBTEST_1(rvalue_copyassign( ArrayXXf::Random(50,50).eval() ));
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(rvalue_copyassign( MatrixXf::Random(50,50).eval() ));
+    CALL_SUBTEST_1(rvalue_copyassign( ArrayXXf::Random(50,50).eval() ));
+
+    CALL_SUBTEST_1(rvalue_copyassign( Matrix<float,1,Dynamic>::Random(50).eval() ));
+    CALL_SUBTEST_1(rvalue_copyassign( Array<float,1,Dynamic>::Random(50).eval() ));
 
-  CALL_SUBTEST_1(rvalue_copyassign( Matrix<float,1,Dynamic>::Random(50).eval() ));
-  CALL_SUBTEST_1(rvalue_copyassign( Array<float,1,Dynamic>::Random(50).eval() ));
+    CALL_SUBTEST_1(rvalue_copyassign( Matrix<float,Dynamic,1>::Random(50).eval() ));
+    CALL_SUBTEST_1(rvalue_copyassign( Array<float,Dynamic,1>::Random(50).eval() ));
 
-  CALL_SUBTEST_1(rvalue_copyassign( Matrix<float,Dynamic,1>::Random(50).eval() ));
-  CALL_SUBTEST_1(rvalue_copyassign( Array<float,Dynamic,1>::Random(50).eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,2,1>::Random().eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,3,1>::Random().eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,4,1>::Random().eval() ));
+
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,2,2>::Random().eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,3,3>::Random().eval() ));
+    CALL_SUBTEST_2(rvalue_copyassign( Array<float,4,4>::Random().eval() ));
   
-  CALL_SUBTEST_2(rvalue_copyassign( Array<float,2,1>::Random().eval() ));
-  CALL_SUBTEST_2(rvalue_copyassign( Array<float,3,1>::Random().eval() ));
-  CALL_SUBTEST_2(rvalue_copyassign( Array<float,4,1>::Random().eval() ));
+    CALL_SUBTEST_3((rvalue_transpositions<PermutationMatrix<Dynamic, Dynamic, int> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_3((rvalue_transpositions<PermutationMatrix<Dynamic, Dynamic, Index> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_4((rvalue_transpositions<Transpositions<Dynamic, Dynamic, int> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
+    CALL_SUBTEST_4((rvalue_transpositions<Transpositions<Dynamic, Dynamic, Index> >(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))));
 
-  CALL_SUBTEST_2(rvalue_copyassign( Array<float,2,2>::Random().eval() ));
-  CALL_SUBTEST_2(rvalue_copyassign( Array<float,3,3>::Random().eval() ));
-  CALL_SUBTEST_2(rvalue_copyassign( Array<float,4,4>::Random().eval() ));
+#if EIGEN_HAS_CXX11
+    CALL_SUBTEST_5(rvalue_move(Eigen::Matrix<MovableScalar<float>,1,3>::Random().eval()));
+    CALL_SUBTEST_5(rvalue_move(Eigen::Matrix<SafeScalar<float>,1,3>::Random().eval()));
+    CALL_SUBTEST_5(rvalue_move(Eigen::Matrix<SafeScalar<float>,Eigen::Dynamic,Eigen::Dynamic>::Random(1,3).eval()));
+#endif
+  }
 }
diff --git a/contrib/eigen/test/schur_complex.cpp b/contrib/eigen/test/schur_complex.cpp
index deb78e44e696062d2ea3dcc1b0d6a93f6487b333..03e17e81defd09f67f87866a34ba684aa629f5d6 100644
--- a/contrib/eigen/test/schur_complex.cpp
+++ b/contrib/eigen/test/schur_complex.cpp
@@ -79,7 +79,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
   }
 }
 
-void test_schur_complex()
+EIGEN_DECLARE_TEST(schur_complex)
 {
   CALL_SUBTEST_1(( schur<Matrix4cd>() ));
   CALL_SUBTEST_2(( schur<MatrixXcf>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4)) ));
diff --git a/contrib/eigen/test/schur_real.cpp b/contrib/eigen/test/schur_real.cpp
index e5229e6e8d24891c4186e95608b20b777dd6f2a6..945461027ae31656cfb63396e9b5d08611ee24d7 100644
--- a/contrib/eigen/test/schur_real.cpp
+++ b/contrib/eigen/test/schur_real.cpp
@@ -98,7 +98,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
   }
 }
 
-void test_schur_real()
+EIGEN_DECLARE_TEST(schur_real)
 {
   CALL_SUBTEST_1(( schur<Matrix4f>() ));
   CALL_SUBTEST_2(( schur<MatrixXd>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4)) ));
diff --git a/contrib/eigen/test/selfadjoint.cpp b/contrib/eigen/test/selfadjoint.cpp
index bb11cc351005bac3696c5815bd2cca9188b93ea3..9ca9cef9e11aafc5279dc56a59b0987eea0ce696 100644
--- a/contrib/eigen/test/selfadjoint.cpp
+++ b/contrib/eigen/test/selfadjoint.cpp
@@ -56,7 +56,7 @@ void bug_159()
   EIGEN_UNUSED_VARIABLE(m)
 }
 
-void test_selfadjoint()
+EIGEN_DECLARE_TEST(selfadjoint)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/contrib/eigen/test/simplicial_cholesky.cpp b/contrib/eigen/test/simplicial_cholesky.cpp
index 649c817b4b4cf368c712aa9297cff259b6dfdeef..538d01ab5223fb0c4cfb0703e4ff8cf0123c08b9 100644
--- a/contrib/eigen/test/simplicial_cholesky.cpp
+++ b/contrib/eigen/test/simplicial_cholesky.cpp
@@ -9,17 +9,17 @@
 
 #include "sparse_solver.h"
 
-template<typename T, typename I> void test_simplicial_cholesky_T()
+template<typename T, typename I_, int flag> void test_simplicial_cholesky_T()
 {
-  typedef SparseMatrix<T,0,I> SparseMatrixType;
+  typedef SparseMatrix<T,flag,I_> SparseMatrixType;
   SimplicialCholesky<SparseMatrixType, Lower> chol_colmajor_lower_amd;
   SimplicialCholesky<SparseMatrixType, Upper> chol_colmajor_upper_amd;
   SimplicialLLT<     SparseMatrixType, Lower> llt_colmajor_lower_amd;
   SimplicialLLT<     SparseMatrixType, Upper> llt_colmajor_upper_amd;
   SimplicialLDLT<    SparseMatrixType, Lower> ldlt_colmajor_lower_amd;
   SimplicialLDLT<    SparseMatrixType, Upper> ldlt_colmajor_upper_amd;
-  SimplicialLDLT<    SparseMatrixType, Lower, NaturalOrdering<I> > ldlt_colmajor_lower_nat;
-  SimplicialLDLT<    SparseMatrixType, Upper, NaturalOrdering<I> > ldlt_colmajor_upper_nat;
+  SimplicialLDLT<    SparseMatrixType, Lower, NaturalOrdering<I_> > ldlt_colmajor_lower_nat;
+  SimplicialLDLT<    SparseMatrixType, Upper, NaturalOrdering<I_> > ldlt_colmajor_upper_nat;
 
   check_sparse_spd_solving(chol_colmajor_lower_amd);
   check_sparse_spd_solving(chol_colmajor_upper_amd);
@@ -35,13 +35,16 @@ template<typename T, typename I> void test_simplicial_cholesky_T()
   check_sparse_spd_determinant(ldlt_colmajor_lower_amd);
   check_sparse_spd_determinant(ldlt_colmajor_upper_amd);
   
-  check_sparse_spd_solving(ldlt_colmajor_lower_nat, 300, 1000);
-  check_sparse_spd_solving(ldlt_colmajor_upper_nat, 300, 1000);
+  check_sparse_spd_solving(ldlt_colmajor_lower_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000);
+  check_sparse_spd_solving(ldlt_colmajor_upper_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000);
 }
 
-void test_simplicial_cholesky()
+EIGEN_DECLARE_TEST(simplicial_cholesky)
 {
-  CALL_SUBTEST_1(( test_simplicial_cholesky_T<double,int>() ));
-  CALL_SUBTEST_2(( test_simplicial_cholesky_T<std::complex<double>, int>() ));
-  CALL_SUBTEST_3(( test_simplicial_cholesky_T<double,long int>() ));
+  CALL_SUBTEST_11(( test_simplicial_cholesky_T<double,               int, ColMajor>() ));
+  CALL_SUBTEST_12(( test_simplicial_cholesky_T<std::complex<double>, int, ColMajor>() ));
+  CALL_SUBTEST_13(( test_simplicial_cholesky_T<double,          long int, ColMajor>() ));
+  CALL_SUBTEST_21(( test_simplicial_cholesky_T<double,               int, RowMajor>() ));
+  CALL_SUBTEST_22(( test_simplicial_cholesky_T<std::complex<double>, int, RowMajor>() ));
+  CALL_SUBTEST_23(( test_simplicial_cholesky_T<double,          long int, RowMajor>() ));
 }
diff --git a/contrib/eigen/test/sizeof.cpp b/contrib/eigen/test/sizeof.cpp
index 03ad2045386642e2f2140458aef23a07804d17c0..af34e97ddfc743d4ab5da3eb7ccdc19d8b299c13 100644
--- a/contrib/eigen/test/sizeof.cpp
+++ b/contrib/eigen/test/sizeof.cpp
@@ -15,10 +15,10 @@ template<typename MatrixType> void verifySizeOf(const MatrixType&)
   if (MatrixType::RowsAtCompileTime!=Dynamic && MatrixType::ColsAtCompileTime!=Dynamic)
     VERIFY_IS_EQUAL(std::ptrdiff_t(sizeof(MatrixType)),std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime));
   else
-    VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index));
+    VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(Index));
 }
 
-void test_sizeof()
+EIGEN_DECLARE_TEST(sizeof)
 {
   CALL_SUBTEST(verifySizeOf(Matrix<float, 1, 1>()) );
   CALL_SUBTEST(verifySizeOf(Array<float, 2, 1>()) );
diff --git a/contrib/eigen/test/sizeoverflow.cpp b/contrib/eigen/test/sizeoverflow.cpp
index 240d22294cb615482a210d8f8ffa9477326c7c04..4213512335d7f91d6ef8dc2cc1d7928b8b678521 100644
--- a/contrib/eigen/test/sizeoverflow.cpp
+++ b/contrib/eigen/test/sizeoverflow.cpp
@@ -34,7 +34,7 @@ void triggerVectorBadAlloc(Index size)
   VERIFY_THROWS_BADALLOC( VectorType v; v.conservativeResize(size) );
 }
 
-void test_sizeoverflow()
+EIGEN_DECLARE_TEST(sizeoverflow)
 {
   // there are 2 levels of overflow checking. first in PlainObjectBase.h we check for overflow in rows*cols computations.
   // this is tested in tests of the form times_itself_gives_0 * times_itself_gives_0
diff --git a/contrib/eigen/test/smallvectors.cpp b/contrib/eigen/test/smallvectors.cpp
index 781511397cc631a02641e92ca0394f2ad9a753dc..f9803acbbda587c001d096b3ee040f18b94948f3 100644
--- a/contrib/eigen/test/smallvectors.cpp
+++ b/contrib/eigen/test/smallvectors.cpp
@@ -57,7 +57,7 @@ template<typename Scalar> void smallVectors()
   }
 }
 
-void test_smallvectors()
+EIGEN_DECLARE_TEST(smallvectors)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST(smallVectors<int>() );
diff --git a/contrib/eigen/test/solverbase.h b/contrib/eigen/test/solverbase.h
new file mode 100644
index 0000000000000000000000000000000000000000..13c09593ace9bdb6bcc243cdc9603c5849414c7d
--- /dev/null
+++ b/contrib/eigen/test/solverbase.h
@@ -0,0 +1,36 @@
+#ifndef TEST_SOLVERBASE_H
+#define TEST_SOLVERBASE_H
+
+template<typename DstType, typename RhsType, typename MatrixType, typename SolverType>
+void check_solverbase(const MatrixType& matrix, const SolverType& solver, Index rows, Index cols, Index cols2)
+{
+  // solve
+  DstType m2               = DstType::Random(cols,cols2);
+  RhsType m3               = matrix*m2;
+  DstType solver_solution  = DstType::Random(cols,cols2);
+  solver._solve_impl(m3, solver_solution);
+  VERIFY_IS_APPROX(m3, matrix*solver_solution);
+  solver_solution          = DstType::Random(cols,cols2);
+  solver_solution          = solver.solve(m3);
+  VERIFY_IS_APPROX(m3, matrix*solver_solution);
+  // test solve with transposed
+  m3                       = RhsType::Random(rows,cols2);
+  m2                       = matrix.transpose()*m3;
+  RhsType solver_solution2 = RhsType::Random(rows,cols2);
+  solver.template _solve_impl_transposed<false>(m2, solver_solution2);
+  VERIFY_IS_APPROX(m2, matrix.transpose()*solver_solution2);
+  solver_solution2         = RhsType::Random(rows,cols2);
+  solver_solution2         = solver.transpose().solve(m2);
+  VERIFY_IS_APPROX(m2, matrix.transpose()*solver_solution2);
+  // test solve with conjugate transposed
+  m3                       = RhsType::Random(rows,cols2);
+  m2                       = matrix.adjoint()*m3;
+  solver_solution2         = RhsType::Random(rows,cols2);
+  solver.template _solve_impl_transposed<true>(m2, solver_solution2);
+  VERIFY_IS_APPROX(m2, matrix.adjoint()*solver_solution2);
+  solver_solution2         = RhsType::Random(rows,cols2);
+  solver_solution2         = solver.adjoint().solve(m2);
+  VERIFY_IS_APPROX(m2, matrix.adjoint()*solver_solution2);
+}
+
+#endif // TEST_SOLVERBASE_H
diff --git a/contrib/eigen/test/sparse.h b/contrib/eigen/test/sparse.h
index 9912e1e246b16a73207985c0b15cc928db3eee42..6cd07fc0aa6fd942aa64623ff66303ad644727c2 100644
--- a/contrib/eigen/test/sparse.h
+++ b/contrib/eigen/test/sparse.h
@@ -14,7 +14,7 @@
 
 #include "main.h"
 
-#if EIGEN_GNUC_AT_LEAST(4,0) && !defined __ICC && !defined(__clang__)
+#if EIGEN_HAS_CXX11
 
 #ifdef min
 #undef min
@@ -24,15 +24,9 @@
 #undef max
 #endif
 
-#include <tr1/unordered_map>
+#include <unordered_map>
 #define EIGEN_UNORDERED_MAP_SUPPORT
-namespace std {
-  using std::tr1::unordered_map;
-}
-#endif
 
-#ifdef EIGEN_GOOGLEHASH_SUPPORT
-  #include <google/sparse_hash_map>
 #endif
 
 #include <Eigen/Cholesky>
diff --git a/contrib/eigen/test/sparseLM.cpp b/contrib/eigen/test/sparseLM.cpp
index 8e148f9bc71bff99133e586ceca1416ae8a88c74..a48fcb685f7566897dce843fb7f28f2b1a8f5b53 100644
--- a/contrib/eigen/test/sparseLM.cpp
+++ b/contrib/eigen/test/sparseLM.cpp
@@ -168,7 +168,7 @@ void test_sparseLM_T()
       return ;
 
 }
-void test_sparseLM()
+EIGEN_DECLARE_TEST(sparseLM)
 {
   CALL_SUBTEST_1(test_sparseLM_T<double>());
   
diff --git a/contrib/eigen/test/sparse_basic.cpp b/contrib/eigen/test/sparse_basic.cpp
index 43318da796c3e520850d276b35e92b8ba2a91a3e..9453111b7f8294d8c85938368153e35bbe5e8c71 100644
--- a/contrib/eigen/test/sparse_basic.cpp
+++ b/contrib/eigen/test/sparse_basic.cpp
@@ -9,9 +9,16 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifndef EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA
 static long g_realloc_count = 0;
 #define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++;
 
+static long g_dense_op_sparse_count = 0;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN g_dense_op_sparse_count++;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN g_dense_op_sparse_count+=10;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN g_dense_op_sparse_count+=20;
+#endif
+
 #include "sparse.h"
 
 template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& ref)
@@ -194,6 +201,7 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     VERIFY_IS_APPROX(refM4.cwiseProduct(m3), refM4.cwiseProduct(refM3));
 //     VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4);
 
+    // mixed sparse-dense
     VERIFY_IS_APPROX(refM4 + m3, refM4 + refM3);
     VERIFY_IS_APPROX(m3 + refM4, refM3 + refM4);
     VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3);
@@ -222,6 +230,26 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     VERIFY_IS_APPROX(m1+=m2, refM1+=refM2);
     VERIFY_IS_APPROX(m1-=m2, refM1-=refM2);
 
+    refM3 = refM1;
+    
+    VERIFY_IS_APPROX(refM1+=m2, refM3+=refM2);
+    VERIFY_IS_APPROX(refM1-=m2, refM3-=refM2);
+
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1 =m2+refM4, refM3 =refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,10);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1+=m2+refM4, refM3+=refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1-=m2+refM4, refM3-=refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1 =refM4+m2, refM3 =refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1+=refM4+m2, refM3+=refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1-=refM4+m2, refM3-=refM2+refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1 =m2-refM4, refM3 =refM2-refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,20);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1+=m2-refM4, refM3+=refM2-refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1-=m2-refM4, refM3-=refM2-refM4);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1 =refM4-m2, refM3 =refM4-refM2);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1+=refM4-m2, refM3+=refM4-refM2);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    g_dense_op_sparse_count=0; VERIFY_IS_APPROX(refM1-=refM4-m2, refM3-=refM4-refM2);  VERIFY_IS_EQUAL(g_dense_op_sparse_count,1);
+    refM3 = m3;
+
     if (rows>=2 && cols>=2)
     {
       VERIFY_RAISES_ASSERT( m1 += m1.innerVector(0) );
@@ -385,7 +413,7 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
 
     m.setFromTriplets(triplets.begin(), triplets.end(), std::multiplies<Scalar>());
     VERIFY_IS_APPROX(m, refMat_prod);
-#if (defined(__cplusplus) && __cplusplus >= 201103L)
+#if (EIGEN_COMP_CXXVER >= 11)
     m.setFromTriplets(triplets.begin(), triplets.end(), [] (Scalar,Scalar b) { return b; });
     VERIFY_IS_APPROX(m, refMat_last);
 #endif
@@ -518,7 +546,7 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
   {
     DenseVector d = DenseVector::Random(rows);
     DenseMatrix refMat2 = d.asDiagonal();
-    SparseMatrixType m2(rows, rows);
+    SparseMatrixType m2;
     m2 = d.asDiagonal();
     VERIFY_IS_APPROX(m2, refMat2);
     SparseMatrixType m3(d.asDiagonal());
@@ -526,6 +554,28 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
     refMat2 += d.asDiagonal();
     m2 += d.asDiagonal();
     VERIFY_IS_APPROX(m2, refMat2);
+    m2.setZero();       m2 += d.asDiagonal();
+    refMat2.setZero();  refMat2 += d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+    m2.setZero();       m2 -= d.asDiagonal();
+    refMat2.setZero();  refMat2 -= d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    initSparse<Scalar>(density, refMat2, m2);
+    m2.makeCompressed();
+    m2 += d.asDiagonal();
+    refMat2 += d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
+
+    initSparse<Scalar>(density, refMat2, m2);
+    m2.makeCompressed();
+    VectorXi res(rows);
+    for(Index i=0; i<rows; ++i)
+      res(i) = internal::random<int>(0,3);
+    m2.reserve(res);
+    m2 -= d.asDiagonal();
+    refMat2 -= d.asDiagonal();
+    VERIFY_IS_APPROX(m2, refMat2);
   }
   
   // test conservative resize
@@ -537,30 +587,38 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
       inc.push_back(std::pair<StorageIndex,StorageIndex>(3,2));
       inc.push_back(std::pair<StorageIndex,StorageIndex>(3,0));
       inc.push_back(std::pair<StorageIndex,StorageIndex>(0,3));
-      
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(0,-1));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(-1,0));
+      inc.push_back(std::pair<StorageIndex,StorageIndex>(-1,-1));
+
       for(size_t i = 0; i< inc.size(); i++) {
         StorageIndex incRows = inc[i].first;
         StorageIndex incCols = inc[i].second;
         SparseMatrixType m1(rows, cols);
         DenseMatrix refMat1 = DenseMatrix::Zero(rows, cols);
         initSparse<Scalar>(density, refMat1, m1);
-        
+
+        SparseMatrixType m2 = m1;
+        m2.makeCompressed();
+
         m1.conservativeResize(rows+incRows, cols+incCols);
+        m2.conservativeResize(rows+incRows, cols+incCols);
         refMat1.conservativeResize(rows+incRows, cols+incCols);
         if (incRows > 0) refMat1.bottomRows(incRows).setZero();
         if (incCols > 0) refMat1.rightCols(incCols).setZero();
-        
+
         VERIFY_IS_APPROX(m1, refMat1);
-        
+        VERIFY_IS_APPROX(m2, refMat1);
+
         // Insert new values
         if (incRows > 0) 
           m1.insert(m1.rows()-1, 0) = refMat1(refMat1.rows()-1, 0) = 1;
         if (incCols > 0) 
           m1.insert(0, m1.cols()-1) = refMat1(0, refMat1.cols()-1) = 1;
-          
+
         VERIFY_IS_APPROX(m1, refMat1);
-          
-          
+
+
       }
   }
 
@@ -630,7 +688,7 @@ void big_sparse_triplet(Index rows, Index cols, double density) {
   typedef Triplet<Scalar,Index> TripletType;
   std::vector<TripletType> triplets;
   double nelements = density * rows*cols;
-  VERIFY(nelements>=0 && nelements <  NumTraits<StorageIndex>::highest());
+  VERIFY(nelements>=0 && nelements < static_cast<double>(NumTraits<StorageIndex>::highest()));
   Index ntriplets = Index(nelements);
   triplets.reserve(ntriplets);
   Scalar sum = Scalar(0);
@@ -649,9 +707,26 @@ void big_sparse_triplet(Index rows, Index cols, double density) {
   VERIFY_IS_APPROX(sum, m.sum());
 }
 
+template<int>
+void bug1105()
+{
+  // Regression test for bug 1105
+  int n = Eigen::internal::random<int>(200,600);
+  SparseMatrix<std::complex<double>,0, long> mat(n, n);
+  std::complex<double> val;
+
+  for(int i=0; i<n; ++i)
+  {
+    mat.coeffRef(i, i%(n/10)) = val;
+    VERIFY(mat.data().allocatedSize()<20*n);
+  }
+}
+
+#ifndef EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA
 
-void test_sparse_basic()
+EIGEN_DECLARE_TEST(sparse_basic)
 {
+  g_dense_op_sparse_count = 0;  // Suppresses compiler warning.
   for(int i = 0; i < g_repeat; i++) {
     int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
     if(Eigen::internal::random<int>(0,4) == 0) {
@@ -680,18 +755,6 @@ void test_sparse_basic()
   CALL_SUBTEST_3((big_sparse_triplet<SparseMatrix<float, RowMajor, int> >(10000, 10000, 0.125)));
   CALL_SUBTEST_4((big_sparse_triplet<SparseMatrix<double, ColMajor, long int> >(10000, 10000, 0.125)));
 
-  // Regression test for bug 1105
-#ifdef EIGEN_TEST_PART_7
-  {
-    int n = Eigen::internal::random<int>(200,600);
-    SparseMatrix<std::complex<double>,0, long> mat(n, n);
-    std::complex<double> val;
-
-    for(int i=0; i<n; ++i)
-    {
-      mat.coeffRef(i, i%(n/10)) = val;
-      VERIFY(mat.data().allocatedSize()<20*n);
-    }
-  }
-#endif
+  CALL_SUBTEST_7( bug1105<0>() );
 }
+#endif
diff --git a/contrib/eigen/test/sparse_block.cpp b/contrib/eigen/test/sparse_block.cpp
index 2a0b3b61720c3d6650cd219863d8621fabf85032..b4905b0531743a5d3eefa066d6994382b5eae4ec 100644
--- a/contrib/eigen/test/sparse_block.cpp
+++ b/contrib/eigen/test/sparse_block.cpp
@@ -8,6 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "sparse.h"
+#include "AnnoyingScalar.h"
 
 template<typename T>
 typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==RowMajorBit, typename T::RowXpr>::type
@@ -31,6 +32,7 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
   const Index outer = ref.outerSize();
 
   typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::RealScalar RealScalar;
   typedef typename SparseMatrixType::StorageIndex StorageIndex;
 
   double density = (std::max)(8./(rows*cols), 0.01);
@@ -164,14 +166,14 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
     {
       VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
       if(j>0)
-        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
+        VERIFY(RealScalar(j)==numext::real(m3.innerVector(j).lastCoeff()));
     }
     m3.makeCompressed();
     for(Index j=0; j<(std::min)(outer, inner); ++j)
     {
       VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
       if(j>0)
-        VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
+        VERIFY(RealScalar(j)==numext::real(m3.innerVector(j).lastCoeff()));
     }
 
     VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros());
@@ -288,7 +290,7 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
   }
 }
 
-void test_sparse_block()
+EIGEN_DECLARE_TEST(sparse_block)
 {
   for(int i = 0; i < g_repeat; i++) {
     int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
@@ -313,5 +315,9 @@ void test_sparse_block()
     
     CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
     CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    AnnoyingScalar::dont_throw = true;
+#endif
+    CALL_SUBTEST_5((  sparse_block(SparseMatrix<AnnoyingScalar>(r,c)) ));
   }
 }
diff --git a/contrib/eigen/test/sparse_permutations.cpp b/contrib/eigen/test/sparse_permutations.cpp
index b82cceff808f95c8ccde6a8033c240c96d9aa7cd..e93493c394bba8d517ae70597b070bed8b961976 100644
--- a/contrib/eigen/test/sparse_permutations.cpp
+++ b/contrib/eigen/test/sparse_permutations.cpp
@@ -220,7 +220,7 @@ template<typename Scalar> void sparse_permutations_all(int size)
   CALL_SUBTEST(( sparse_permutations<RowMajor>(SparseMatrix<Scalar, RowMajor>(size,size)) ));
 }
 
-void test_sparse_permutations()
+EIGEN_DECLARE_TEST(sparse_permutations)
 {
   for(int i = 0; i < g_repeat; i++) {
     int s = Eigen::internal::random<int>(1,50);
diff --git a/contrib/eigen/test/sparse_product.cpp b/contrib/eigen/test/sparse_product.cpp
index 7f77bb7427dcb044644962fc3e3e446fc93ef4d5..6e85f6914810e7bbf7285dfb57662fd483f44eb9 100644
--- a/contrib/eigen/test/sparse_product.cpp
+++ b/contrib/eigen/test/sparse_product.cpp
@@ -100,13 +100,15 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(m4=(m2t.transpose()*m3t.transpose()).pruned(0), refMat4=refMat2t.transpose()*refMat3t.transpose());
     VERIFY_IS_APPROX(m4=(m2*m3t.transpose()).pruned(0), refMat4=refMat2*refMat3t.transpose());
 
+#ifndef EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT
     // make sure the right product implementation is called:
     if((!SparseMatrixType::IsRowMajor) && m2.rows()<=m3.cols())
     {
-      VERIFY_EVALUATION_COUNT(m4 = m2*m3, 3); // 1 temp for the result + 2 for transposing and get a sorted result.
+      VERIFY_EVALUATION_COUNT(m4 = m2*m3, 2); // 2 for transposing and get a sorted result.
       VERIFY_EVALUATION_COUNT(m4 = (m2*m3).pruned(0), 1);
       VERIFY_EVALUATION_COUNT(m4 = (m2*m3).eval().pruned(0), 4);
     }
+#endif
 
     // and that pruning is effective:
     {
@@ -151,7 +153,7 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(dm4.noalias()-=m2*refMat3, refMat4-=refMat2*refMat3);
     VERIFY_IS_APPROX(dm4=m2*(refMat3+refMat3), refMat4=refMat2*(refMat3+refMat3));
     VERIFY_IS_APPROX(dm4=m2t.transpose()*(refMat3+refMat5)*0.5, refMat4=refMat2t.transpose()*(refMat3+refMat5)*0.5);
-    
+
     // sparse * dense vector
     VERIFY_IS_APPROX(dm4.col(0)=m2*refMat3.col(0), refMat4.col(0)=refMat2*refMat3.col(0));
     VERIFY_IS_APPROX(dm4.col(0)=m2*refMat3t.transpose().col(0), refMat4.col(0)=refMat2*refMat3t.transpose().col(0));
@@ -182,7 +184,7 @@ template<typename SparseMatrixType> void sparse_product()
       VERIFY_IS_APPROX( m4=m2.middleCols(c,1)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
       VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
       VERIFY_IS_APPROX(dm4=m2.col(c)*dm5.col(c1).transpose(), refMat4=refMat2.col(c)*dm5.col(c1).transpose());
-      
+
       VERIFY_IS_APPROX(m4=dm5.col(c1)*m2.col(c).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
       VERIFY_IS_EQUAL(m4.nonZeros(), (refMat4.array()!=0).count());
       VERIFY_IS_APPROX(m4=dm5.col(c1)*m2.middleCols(c,1).transpose(), refMat4=dm5.col(c1)*refMat2.col(c).transpose());
@@ -211,23 +213,23 @@ template<typename SparseMatrixType> void sparse_product()
     }
 
     VERIFY_IS_APPROX(m6=m6*m6, refMat6=refMat6*refMat6);
-    
+
     // sparse matrix * sparse vector
     ColSpVector cv0(cols), cv1;
     DenseVector dcv0(cols), dcv1;
     initSparse(2*density,dcv0, cv0);
-    
+
     RowSpVector rv0(depth), rv1;
     RowDenseVector drv0(depth), drv1(rv1);
     initSparse(2*density,drv0, rv0);
 
-    VERIFY_IS_APPROX(cv1=m3*cv0, dcv1=refMat3*dcv0);    
+    VERIFY_IS_APPROX(cv1=m3*cv0, dcv1=refMat3*dcv0);
     VERIFY_IS_APPROX(rv1=rv0*m3, drv1=drv0*refMat3);
     VERIFY_IS_APPROX(cv1=m3t.adjoint()*cv0, dcv1=refMat3t.adjoint()*dcv0);
     VERIFY_IS_APPROX(cv1=rv0*m3, dcv1=drv0*refMat3);
     VERIFY_IS_APPROX(rv1=m3*cv0, drv1=refMat3*dcv0);
   }
-  
+
   // test matrix - diagonal product
   {
     DenseMatrix refM2 = DenseMatrix::Zero(rows, cols);
@@ -243,7 +245,7 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(m3=m2.transpose()*d2, refM3=refM2.transpose()*d2);
     VERIFY_IS_APPROX(m3=d2*m2, refM3=d2*refM2);
     VERIFY_IS_APPROX(m3=d1*m2.transpose(), refM3=d1*refM2.transpose());
-    
+
     // also check with a SparseWrapper:
     DenseVector v1 = DenseVector::Random(cols);
     DenseVector v2 = DenseVector::Random(rows);
@@ -252,12 +254,12 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(m3=m2.transpose()*v2.asDiagonal(), refM3=refM2.transpose()*v2.asDiagonal());
     VERIFY_IS_APPROX(m3=v2.asDiagonal()*m2, refM3=v2.asDiagonal()*refM2);
     VERIFY_IS_APPROX(m3=v1.asDiagonal()*m2.transpose(), refM3=v1.asDiagonal()*refM2.transpose());
-    
+
     VERIFY_IS_APPROX(m3=v2.asDiagonal()*m2*v1.asDiagonal(), refM3=v2.asDiagonal()*refM2*v1.asDiagonal());
 
     VERIFY_IS_APPROX(v2=m2*v1.asDiagonal()*v1, refM2*v1.asDiagonal()*v1);
     VERIFY_IS_APPROX(v3=v2.asDiagonal()*m2*v1, v2.asDiagonal()*refM2*v1);
-    
+
     // evaluate to a dense matrix to check the .row() and .col() iterator functions
     VERIFY_IS_APPROX(d3=m2*d1, refM3=refM2*d1);
     VERIFY_IS_APPROX(d3=m2.transpose()*d2, refM3=refM2.transpose()*d2);
@@ -310,20 +312,20 @@ template<typename SparseMatrixType> void sparse_product()
     VERIFY_IS_APPROX(x.noalias()+=mUp.template selfadjointView<Upper>()*b, refX+=refS*b);
     VERIFY_IS_APPROX(x.noalias()-=mLo.template selfadjointView<Lower>()*b, refX-=refS*b);
     VERIFY_IS_APPROX(x.noalias()+=mS.template selfadjointView<Upper|Lower>()*b, refX+=refS*b);
-    
+
     // sparse selfadjointView with sparse matrices
     SparseMatrixType mSres(rows,rows);
     VERIFY_IS_APPROX(mSres = mLo.template selfadjointView<Lower>()*mS,
                      refX = refLo.template selfadjointView<Lower>()*refS);
     VERIFY_IS_APPROX(mSres = mS * mLo.template selfadjointView<Lower>(),
                      refX = refS * refLo.template selfadjointView<Lower>());
-    
+
     // sparse triangularView with dense matrices
     VERIFY_IS_APPROX(x=mA.template triangularView<Upper>()*b, refX=refA.template triangularView<Upper>()*b);
     VERIFY_IS_APPROX(x=mA.template triangularView<Lower>()*b, refX=refA.template triangularView<Lower>()*b);
     VERIFY_IS_APPROX(x=b*mA.template triangularView<Upper>(), refX=b*refA.template triangularView<Upper>());
     VERIFY_IS_APPROX(x=b*mA.template triangularView<Lower>(), refX=b*refA.template triangularView<Lower>());
-    
+
     // sparse triangularView with sparse matrices
     VERIFY_IS_APPROX(mSres = mA.template triangularView<Lower>()*mS,   refX = refA.template triangularView<Lower>()*refS);
     VERIFY_IS_APPROX(mSres = mS * mA.template triangularView<Lower>(), refX = refS * refA.template triangularView<Lower>());
@@ -368,9 +370,9 @@ void bug_942()
 
   Vector d(1);
   d[0] = 2;
-  
+
   double res = 2;
-  
+
   VERIFY_IS_APPROX( ( cmA*d.asDiagonal() ).eval().coeff(0,0), res );
   VERIFY_IS_APPROX( ( d.asDiagonal()*rmA ).eval().coeff(0,0), res );
   VERIFY_IS_APPROX( ( rmA*d.asDiagonal() ).eval().coeff(0,0), res );
@@ -459,7 +461,7 @@ void test_mixing_types()
   VERIFY_IS_APPROX( dC2 = sC1 * dR1.col(0), dC3 = sC1 * dR1.template cast<Cplx>().col(0) );
 }
 
-void test_sparse_product()
+EIGEN_DECLARE_TEST(sparse_product)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( (sparse_product<SparseMatrix<double,ColMajor> >()) );
diff --git a/contrib/eigen/test/sparse_ref.cpp b/contrib/eigen/test/sparse_ref.cpp
index 5e9607234d106c52582bf337fc8dbfd016f0b39d..12b6f8a9dd7b66b6b9f9badb105e54e45bb50178 100644
--- a/contrib/eigen/test/sparse_ref.cpp
+++ b/contrib/eigen/test/sparse_ref.cpp
@@ -126,7 +126,7 @@ void call_ref()
   VERIFY_EVALUATION_COUNT( call_ref_5(A.row(2), A.row(2).transpose()),  1);
 }
 
-void test_sparse_ref()
+EIGEN_DECLARE_TEST(sparse_ref)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( check_const_correctness(SparseMatrix<float>()) );
diff --git a/contrib/eigen/test/sparse_solver.h b/contrib/eigen/test/sparse_solver.h
index 5145bc3eb5a7cd7a4067a03776e0e055ccc7adc0..58927944bdd98ac86133025c79a9437daefe9018 100644
--- a/contrib/eigen/test/sparse_solver.h
+++ b/contrib/eigen/test/sparse_solver.h
@@ -9,6 +9,7 @@
 
 #include "sparse.h"
 #include <Eigen/SparseCore>
+#include <Eigen/SparseLU>
 #include <sstream>
 
 template<typename Solver, typename Rhs, typename Guess,typename Result>
@@ -59,7 +60,11 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
     x = solver.solve(b);
     if (solver.info() != Success)
     {
-      std::cerr << "WARNING | sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
+      std::cerr << "WARNING: sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
+      // dump call stack:
+      g_test_level++; 
+      VERIFY(solver.info() == Success);
+      g_test_level--;
       return;
     }
     VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
@@ -67,7 +72,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
 
     x.setZero();
     solve_with_guess(solver, b, x, x);
-    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(solver.info() == Success && "solving failed when using solve_with_guess API");
     VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
     VERIFY(x.isApprox(refX,test_precision<Scalar>()));
     
@@ -140,6 +145,136 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
   }
 }
 
+// specialization of generic check_sparse_solving for SuperLU in order to also test adjoint and transpose solves
+template<typename Scalar, typename Rhs, typename DenseMat, typename DenseRhs>
+void check_sparse_solving(Eigen::SparseLU<Eigen::SparseMatrix<Scalar> >& solver, const typename Eigen::SparseMatrix<Scalar>& A, const Rhs& b, const DenseMat& dA, const DenseRhs& db)
+{
+  typedef typename Eigen::SparseMatrix<Scalar> Mat;
+  typedef typename Mat::StorageIndex StorageIndex;
+  typedef typename Eigen::SparseLU<Eigen::SparseMatrix<Scalar> > Solver;
+
+  // reference solutions computed by dense QR solver
+  DenseRhs refX1 = dA.householderQr().solve(db); // solution of A x = db
+  DenseRhs refX2 = dA.transpose().householderQr().solve(db); // solution of A^T * x = db (use transposed matrix A^T)
+  DenseRhs refX3 = dA.adjoint().householderQr().solve(db);  // solution of A^* * x = db (use adjoint matrix A^*)
+
+
+  {
+    Rhs x1(A.cols(), b.cols());
+    Rhs x2(A.cols(), b.cols());
+    Rhs x3(A.cols(), b.cols());
+    Rhs oldb = b;
+
+    solver.compute(A);
+    if (solver.info() != Success)
+    {
+      std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+      VERIFY(solver.info() == Success);
+    }
+    x1 = solver.solve(b);
+    if (solver.info() != Success)
+    {
+      std::cerr << "WARNING | sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
+      return;
+    }
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+
+    // test solve with transposed
+    x2 = solver.transpose().solve(b);
+    VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x2.isApprox(refX2,test_precision<Scalar>()));
+
+
+    // test solve with adjoint
+    //solver.template _solve_impl_transposed<true>(b, x3);
+    x3 = solver.adjoint().solve(b);
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x3.isApprox(refX3,test_precision<Scalar>()));
+
+    x1.setZero();
+    solve_with_guess(solver, b, x1, x1);
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+
+    x1.setZero();
+    x2.setZero();
+    x3.setZero();
+    // test the analyze/factorize API
+    solver.analyzePattern(A);
+    solver.factorize(A);
+    VERIFY(solver.info() == Success && "factorization failed when using analyzePattern/factorize API");
+    x1 = solver.solve(b);
+    x2 = solver.transpose().solve(b);
+    x3 = solver.adjoint().solve(b);
+
+    VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
+    VERIFY(oldb.isApprox(b,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(x1.isApprox(refX1,test_precision<Scalar>()));
+    VERIFY(x2.isApprox(refX2,test_precision<Scalar>()));
+    VERIFY(x3.isApprox(refX3,test_precision<Scalar>()));
+
+    x1.setZero();
+    // test with Map
+    MappedSparseMatrix<Scalar,Mat::Options,StorageIndex> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
+    solver.compute(Am);
+    VERIFY(solver.info() == Success && "factorization failed when using Map");
+    DenseRhs dx(refX1);
+    dx.setZero();
+    Map<DenseRhs> xm(dx.data(), dx.rows(), dx.cols());
+    Map<const DenseRhs> bm(db.data(), db.rows(), db.cols());
+    xm = solver.solve(bm);
+    VERIFY(solver.info() == Success && "solving failed when using Map");
+    VERIFY(oldb.isApprox(bm,0.0) && "sparse solver testing: the rhs should not be modified!");
+    VERIFY(xm.isApprox(refX1,test_precision<Scalar>()));
+  }
+
+  // if not too large, do some extra check:
+  if(A.rows()<2000)
+  {
+    // test initialization ctor
+    {
+      Rhs x(b.rows(), b.cols());
+      Solver solver2(A);
+      VERIFY(solver2.info() == Success);
+      x = solver2.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test dense Block as the result and rhs:
+    {
+      DenseRhs x(refX1.rows(), refX1.cols());
+      DenseRhs oldb(db);
+      x.setZero();
+      x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
+      VERIFY(oldb.isApprox(db,0.0) && "sparse solver testing: the rhs should not be modified!");
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test uncompressed inputs
+    {
+      Mat A2 = A;
+      A2.reserve((ArrayXf::Random(A.outerSize())+2).template cast<typename Mat::StorageIndex>().eval());
+      solver.compute(A2);
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+    }
+
+    // test expression as input
+    {
+      solver.compute(0.5*(A+A));
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX1,test_precision<Scalar>()));
+
+      Solver solver2(0.5*(A+A));
+      Rhs x2 = solver2.solve(b);
+      VERIFY(x2.isApprox(refX1,test_precision<Scalar>()));
+    }
+  }
+}
+
+
 template<typename Solver, typename Rhs>
 void check_sparse_solving_real_cases(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const typename Solver::MatrixType& fullA, const Rhs& refX)
 {
@@ -266,7 +401,7 @@ std::string solver_stats(const SparseSolverBase<Derived> &/*solver*/)
 }
 #endif
 
-template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxSize = 300, int maxRealWorldSize = 100000)
+template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxSize = (std::min)(300,EIGEN_TEST_MAX_SIZE), int maxRealWorldSize = 100000)
 {
   typedef typename Solver::MatrixType Mat;
   typedef typename Mat::Scalar Scalar;
@@ -429,8 +564,7 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver, int m
     // check only once
     if(i==0)
     {
-      b = DenseVector::Zero(size);
-      check_sparse_solving(solver, A, b, dA, b);
+      CALL_SUBTEST(b = DenseVector::Zero(size); check_sparse_solving(solver, A, b, dA, b));
     }
     // regression test for Bug 792 (structurally rank deficient matrices):
     if(checkDeficient && size>1) {
diff --git a/contrib/eigen/test/sparse_solvers.cpp b/contrib/eigen/test/sparse_solvers.cpp
index 3a8873d430b59bd83b41722121052852a1ca0496..3b7cd7788848b56183e44f72e2be7b48b0f9102e 100644
--- a/contrib/eigen/test/sparse_solvers.cpp
+++ b/contrib/eigen/test/sparse_solvers.cpp
@@ -98,10 +98,23 @@ template<typename Scalar> void sparse_solvers(int rows, int cols)
     initSparse<Scalar>(density, refMat2, m2, ForceNonZeroDiag|MakeLowerTriangular, &zeroCoords, &nonzeroCoords);
     VERIFY_IS_APPROX(refMat2.template triangularView<Lower>().solve(vec2),
                      m2.template triangularView<Lower>().solve(vec3));
+
+    // test empty triangular matrix
+    {
+      m2.resize(0,0);
+      refMatB.resize(0,refMatB.cols());
+      DenseMatrix res = m2.template triangularView<Lower>().solve(refMatB);
+      VERIFY_IS_EQUAL(res.rows(),0);
+      VERIFY_IS_EQUAL(res.cols(),refMatB.cols());
+      res = refMatB;
+      m2.template triangularView<Lower>().solveInPlace(res);
+      VERIFY_IS_EQUAL(res.rows(),0);
+      VERIFY_IS_EQUAL(res.cols(),refMatB.cols());
+    }
   }
 }
 
-void test_sparse_solvers()
+EIGEN_DECLARE_TEST(sparse_solvers)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(sparse_solvers<double>(8, 8) );
diff --git a/contrib/eigen/test/sparse_vector.cpp b/contrib/eigen/test/sparse_vector.cpp
index b3e1dda259d4adc2429d3d0e0c5b6bcbcd292c5c..35129278bd7ff3913f0af86395e697f3ab9f01d8 100644
--- a/contrib/eigen/test/sparse_vector.cpp
+++ b/contrib/eigen/test/sparse_vector.cpp
@@ -145,7 +145,7 @@ template<typename Scalar,typename StorageIndex> void sparse_vector(int rows, int
 
 }
 
-void test_sparse_vector()
+EIGEN_DECLARE_TEST(sparse_vector)
 {
   for(int i = 0; i < g_repeat; i++) {
     int r = Eigen::internal::random<int>(1,500), c = Eigen::internal::random<int>(1,500);
diff --git a/contrib/eigen/test/sparselu.cpp b/contrib/eigen/test/sparselu.cpp
index bd000baf1028208e728b41347e23d3ff5c7993cc..84cc6ebe53fba5e74addc3c32596fe2826c92a6d 100644
--- a/contrib/eigen/test/sparselu.cpp
+++ b/contrib/eigen/test/sparselu.cpp
@@ -36,7 +36,7 @@ template<typename T> void test_sparselu_T()
   check_sparse_square_determinant(sparselu_amd);
 }
 
-void test_sparselu()
+EIGEN_DECLARE_TEST(sparselu)
 {
   CALL_SUBTEST_1(test_sparselu_T<float>()); 
   CALL_SUBTEST_2(test_sparselu_T<double>());
diff --git a/contrib/eigen/test/sparseqr.cpp b/contrib/eigen/test/sparseqr.cpp
index f0e721fcea2cc6949ab8b4ac66a0f3434b6c7dc5..3576cc626f5b11a7d28b2472ac56ac338946bf15 100644
--- a/contrib/eigen/test/sparseqr.cpp
+++ b/contrib/eigen/test/sparseqr.cpp
@@ -43,6 +43,7 @@ int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows
 
 template<typename Scalar> void test_sparseqr_scalar()
 {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef SparseMatrix<Scalar,ColMajor> MatrixType; 
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMat;
   typedef Matrix<Scalar,Dynamic,1> DenseVector;
@@ -91,14 +92,34 @@ template<typename Scalar> void test_sparseqr_scalar()
     exit(0);
     return;
   }
-  
-  VERIFY_IS_APPROX(A * x, b);
-  
-  //Compare with a dense QR solver
+
+  // Compare with a dense QR solver
   ColPivHouseholderQR<DenseMat> dqr(dA);
   refX = dqr.solve(b);
   
-  VERIFY_IS_EQUAL(dqr.rank(), solver.rank());
+  bool rank_deficient = A.cols()>A.rows() || dqr.rank()<A.cols();
+  if(rank_deficient)
+  {
+    // rank deficient problem -> we might have to increase the threshold
+    // to get a correct solution.
+    RealScalar th = RealScalar(20)*dA.colwise().norm().maxCoeff()*(A.rows()+A.cols()) * NumTraits<RealScalar>::epsilon();
+    for(Index k=0; (k<16) && !test_isApprox(A*x,b); ++k)
+    {
+      th *= RealScalar(10);
+      solver.setPivotThreshold(th);
+      solver.compute(A);
+      x = solver.solve(b);
+    }
+  }
+
+  VERIFY_IS_APPROX(A * x, b);
+  
+  // For rank deficient problem, the estimated rank might
+  // be slightly off, so let's only raise a warning in such cases.
+  if(rank_deficient) ++g_test_level;
+  VERIFY_IS_EQUAL(solver.rank(), dqr.rank());
+  if(rank_deficient) --g_test_level;
+
   if(solver.rank()==A.cols()) // full rank
     VERIFY_IS_APPROX(x, refX);
 //   else
@@ -117,7 +138,7 @@ template<typename Scalar> void test_sparseqr_scalar()
   dQ = solver.matrixQ();
   VERIFY_IS_APPROX(Q, dQ);
 }
-void test_sparseqr()
+EIGEN_DECLARE_TEST(sparseqr)
 {
   for(int i=0; i<g_repeat; ++i)
   {
diff --git a/contrib/eigen/test/special_numbers.cpp b/contrib/eigen/test/special_numbers.cpp
index 2f1b704be9c72f91db8726710192089a64c7ad5c..1e1a6363115d89852a5faf49fc881dd33fb85dfe 100644
--- a/contrib/eigen/test/special_numbers.cpp
+++ b/contrib/eigen/test/special_numbers.cpp
@@ -49,7 +49,7 @@ template<typename Scalar> void special_numbers()
   VERIFY(!mboth.array().allFinite());
 }
 
-void test_special_numbers()
+EIGEN_DECLARE_TEST(special_numbers)
 {
   for(int i = 0; i < 10*g_repeat; i++) {
     CALL_SUBTEST_1( special_numbers<float>() );
diff --git a/contrib/eigen/test/split_test_helper.h b/contrib/eigen/test/split_test_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..82e82aaefcf532206f97d249777b7af3627401ed
--- /dev/null
+++ b/contrib/eigen/test/split_test_helper.h
@@ -0,0 +1,5994 @@
+#if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_1(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_1(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_2(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_2(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_3) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_3(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_3(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_4) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_4(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_4(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_5) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_5(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_5(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_6) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_6(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_6(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_7) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_7(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_7(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_8) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_8(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_8(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_9) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_9(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_9(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_10) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_10(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_10(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_11) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_11(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_11(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_12) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_12(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_12(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_13) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_13(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_13(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_14) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_14(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_14(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_15) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_15(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_15(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_16) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_16(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_16(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_17) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_17(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_17(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_18) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_18(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_18(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_19) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_19(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_19(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_20) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_20(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_20(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_21) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_21(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_21(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_22) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_22(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_22(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_23) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_23(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_23(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_24) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_24(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_24(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_25) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_25(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_25(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_26) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_26(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_26(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_27) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_27(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_27(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_28) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_28(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_28(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_29) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_29(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_29(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_30) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_30(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_30(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_31) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_31(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_31(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_32) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_32(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_32(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_33) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_33(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_33(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_34) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_34(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_34(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_35) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_35(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_35(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_36) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_36(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_36(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_37) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_37(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_37(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_38) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_38(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_38(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_39) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_39(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_39(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_40) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_40(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_40(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_41) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_41(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_41(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_42) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_42(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_42(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_43) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_43(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_43(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_44) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_44(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_44(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_45) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_45(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_45(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_46) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_46(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_46(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_47) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_47(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_47(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_48) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_48(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_48(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_49) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_49(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_49(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_50) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_50(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_50(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_51) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_51(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_51(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_52) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_52(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_52(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_53) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_53(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_53(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_54) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_54(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_54(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_55) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_55(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_55(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_56) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_56(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_56(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_57) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_57(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_57(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_58) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_58(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_58(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_59) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_59(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_59(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_60) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_60(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_60(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_61) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_61(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_61(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_62) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_62(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_62(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_63) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_63(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_63(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_64) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_64(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_64(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_65) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_65(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_65(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_66) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_66(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_66(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_67) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_67(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_67(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_68) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_68(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_68(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_69) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_69(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_69(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_70) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_70(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_70(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_71) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_71(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_71(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_72) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_72(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_72(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_73) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_73(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_73(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_74) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_74(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_74(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_75) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_75(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_75(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_76) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_76(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_76(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_77) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_77(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_77(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_78) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_78(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_78(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_79) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_79(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_79(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_80) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_80(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_80(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_81) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_81(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_81(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_82) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_82(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_82(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_83) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_83(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_83(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_84) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_84(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_84(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_85) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_85(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_85(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_86) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_86(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_86(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_87) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_87(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_87(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_88) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_88(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_88(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_89) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_89(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_89(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_90) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_90(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_90(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_91) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_91(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_91(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_92) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_92(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_92(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_93) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_93(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_93(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_94) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_94(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_94(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_95) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_95(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_95(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_96) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_96(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_96(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_97) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_97(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_97(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_98) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_98(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_98(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_99) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_99(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_99(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_100) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_100(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_100(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_101) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_101(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_101(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_102) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_102(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_102(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_103) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_103(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_103(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_104) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_104(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_104(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_105) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_105(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_105(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_106) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_106(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_106(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_107) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_107(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_107(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_108) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_108(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_108(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_109) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_109(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_109(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_110) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_110(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_110(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_111) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_111(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_111(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_112) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_112(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_112(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_113) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_113(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_113(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_114) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_114(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_114(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_115) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_115(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_115(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_116) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_116(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_116(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_117) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_117(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_117(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_118) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_118(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_118(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_119) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_119(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_119(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_120) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_120(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_120(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_121) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_121(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_121(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_122) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_122(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_122(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_123) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_123(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_123(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_124) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_124(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_124(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_125) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_125(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_125(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_126) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_126(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_126(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_127) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_127(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_127(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_128) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_128(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_128(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_129) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_129(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_129(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_130) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_130(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_130(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_131) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_131(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_131(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_132) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_132(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_132(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_133) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_133(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_133(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_134) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_134(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_134(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_135) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_135(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_135(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_136) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_136(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_136(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_137) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_137(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_137(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_138) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_138(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_138(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_139) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_139(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_139(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_140) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_140(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_140(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_141) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_141(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_141(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_142) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_142(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_142(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_143) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_143(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_143(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_144) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_144(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_144(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_145) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_145(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_145(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_146) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_146(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_146(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_147) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_147(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_147(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_148) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_148(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_148(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_149) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_149(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_149(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_150) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_150(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_150(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_151) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_151(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_151(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_152) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_152(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_152(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_153) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_153(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_153(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_154) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_154(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_154(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_155) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_155(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_155(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_156) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_156(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_156(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_157) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_157(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_157(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_158) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_158(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_158(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_159) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_159(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_159(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_160) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_160(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_160(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_161) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_161(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_161(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_162) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_162(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_162(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_163) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_163(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_163(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_164) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_164(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_164(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_165) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_165(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_165(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_166) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_166(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_166(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_167) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_167(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_167(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_168) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_168(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_168(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_169) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_169(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_169(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_170) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_170(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_170(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_171) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_171(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_171(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_172) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_172(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_172(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_173) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_173(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_173(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_174) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_174(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_174(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_175) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_175(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_175(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_176) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_176(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_176(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_177) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_177(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_177(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_178) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_178(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_178(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_179) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_179(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_179(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_180) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_180(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_180(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_181) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_181(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_181(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_182) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_182(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_182(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_183) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_183(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_183(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_184) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_184(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_184(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_185) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_185(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_185(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_186) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_186(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_186(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_187) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_187(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_187(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_188) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_188(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_188(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_189) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_189(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_189(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_190) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_190(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_190(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_191) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_191(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_191(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_192) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_192(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_192(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_193) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_193(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_193(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_194) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_194(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_194(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_195) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_195(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_195(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_196) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_196(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_196(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_197) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_197(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_197(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_198) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_198(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_198(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_199) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_199(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_199(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_200) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_200(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_200(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_201) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_201(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_201(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_202) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_202(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_202(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_203) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_203(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_203(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_204) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_204(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_204(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_205) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_205(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_205(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_206) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_206(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_206(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_207) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_207(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_207(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_208) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_208(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_208(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_209) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_209(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_209(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_210) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_210(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_210(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_211) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_211(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_211(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_212) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_212(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_212(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_213) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_213(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_213(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_214) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_214(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_214(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_215) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_215(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_215(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_216) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_216(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_216(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_217) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_217(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_217(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_218) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_218(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_218(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_219) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_219(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_219(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_220) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_220(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_220(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_221) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_221(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_221(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_222) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_222(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_222(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_223) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_223(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_223(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_224) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_224(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_224(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_225) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_225(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_225(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_226) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_226(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_226(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_227) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_227(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_227(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_228) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_228(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_228(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_229) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_229(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_229(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_230) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_230(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_230(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_231) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_231(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_231(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_232) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_232(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_232(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_233) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_233(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_233(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_234) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_234(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_234(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_235) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_235(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_235(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_236) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_236(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_236(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_237) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_237(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_237(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_238) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_238(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_238(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_239) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_239(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_239(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_240) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_240(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_240(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_241) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_241(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_241(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_242) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_242(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_242(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_243) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_243(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_243(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_244) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_244(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_244(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_245) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_245(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_245(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_246) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_246(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_246(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_247) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_247(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_247(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_248) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_248(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_248(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_249) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_249(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_249(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_250) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_250(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_250(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_251) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_251(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_251(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_252) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_252(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_252(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_253) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_253(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_253(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_254) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_254(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_254(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_255) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_255(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_255(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_256) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_256(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_256(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_257) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_257(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_257(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_258) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_258(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_258(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_259) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_259(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_259(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_260) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_260(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_260(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_261) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_261(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_261(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_262) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_262(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_262(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_263) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_263(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_263(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_264) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_264(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_264(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_265) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_265(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_265(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_266) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_266(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_266(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_267) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_267(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_267(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_268) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_268(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_268(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_269) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_269(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_269(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_270) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_270(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_270(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_271) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_271(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_271(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_272) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_272(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_272(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_273) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_273(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_273(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_274) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_274(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_274(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_275) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_275(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_275(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_276) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_276(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_276(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_277) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_277(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_277(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_278) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_278(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_278(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_279) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_279(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_279(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_280) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_280(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_280(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_281) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_281(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_281(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_282) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_282(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_282(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_283) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_283(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_283(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_284) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_284(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_284(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_285) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_285(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_285(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_286) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_286(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_286(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_287) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_287(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_287(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_288) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_288(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_288(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_289) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_289(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_289(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_290) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_290(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_290(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_291) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_291(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_291(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_292) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_292(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_292(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_293) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_293(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_293(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_294) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_294(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_294(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_295) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_295(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_295(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_296) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_296(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_296(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_297) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_297(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_297(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_298) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_298(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_298(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_299) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_299(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_299(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_300) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_300(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_300(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_301) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_301(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_301(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_302) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_302(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_302(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_303) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_303(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_303(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_304) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_304(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_304(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_305) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_305(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_305(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_306) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_306(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_306(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_307) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_307(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_307(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_308) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_308(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_308(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_309) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_309(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_309(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_310) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_310(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_310(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_311) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_311(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_311(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_312) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_312(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_312(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_313) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_313(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_313(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_314) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_314(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_314(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_315) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_315(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_315(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_316) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_316(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_316(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_317) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_317(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_317(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_318) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_318(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_318(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_319) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_319(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_319(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_320) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_320(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_320(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_321) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_321(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_321(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_322) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_322(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_322(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_323) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_323(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_323(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_324) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_324(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_324(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_325) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_325(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_325(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_326) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_326(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_326(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_327) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_327(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_327(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_328) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_328(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_328(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_329) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_329(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_329(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_330) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_330(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_330(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_331) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_331(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_331(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_332) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_332(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_332(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_333) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_333(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_333(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_334) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_334(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_334(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_335) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_335(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_335(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_336) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_336(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_336(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_337) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_337(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_337(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_338) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_338(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_338(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_339) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_339(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_339(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_340) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_340(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_340(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_341) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_341(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_341(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_342) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_342(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_342(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_343) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_343(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_343(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_344) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_344(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_344(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_345) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_345(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_345(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_346) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_346(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_346(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_347) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_347(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_347(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_348) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_348(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_348(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_349) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_349(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_349(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_350) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_350(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_350(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_351) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_351(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_351(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_352) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_352(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_352(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_353) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_353(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_353(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_354) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_354(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_354(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_355) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_355(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_355(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_356) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_356(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_356(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_357) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_357(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_357(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_358) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_358(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_358(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_359) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_359(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_359(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_360) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_360(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_360(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_361) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_361(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_361(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_362) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_362(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_362(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_363) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_363(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_363(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_364) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_364(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_364(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_365) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_365(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_365(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_366) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_366(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_366(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_367) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_367(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_367(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_368) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_368(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_368(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_369) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_369(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_369(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_370) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_370(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_370(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_371) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_371(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_371(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_372) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_372(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_372(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_373) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_373(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_373(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_374) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_374(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_374(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_375) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_375(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_375(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_376) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_376(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_376(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_377) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_377(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_377(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_378) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_378(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_378(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_379) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_379(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_379(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_380) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_380(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_380(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_381) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_381(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_381(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_382) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_382(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_382(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_383) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_383(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_383(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_384) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_384(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_384(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_385) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_385(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_385(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_386) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_386(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_386(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_387) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_387(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_387(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_388) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_388(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_388(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_389) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_389(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_389(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_390) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_390(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_390(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_391) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_391(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_391(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_392) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_392(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_392(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_393) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_393(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_393(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_394) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_394(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_394(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_395) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_395(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_395(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_396) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_396(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_396(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_397) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_397(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_397(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_398) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_398(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_398(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_399) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_399(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_399(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_400) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_400(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_400(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_401) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_401(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_401(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_402) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_402(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_402(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_403) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_403(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_403(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_404) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_404(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_404(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_405) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_405(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_405(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_406) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_406(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_406(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_407) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_407(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_407(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_408) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_408(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_408(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_409) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_409(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_409(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_410) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_410(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_410(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_411) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_411(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_411(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_412) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_412(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_412(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_413) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_413(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_413(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_414) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_414(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_414(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_415) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_415(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_415(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_416) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_416(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_416(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_417) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_417(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_417(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_418) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_418(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_418(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_419) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_419(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_419(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_420) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_420(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_420(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_421) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_421(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_421(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_422) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_422(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_422(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_423) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_423(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_423(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_424) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_424(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_424(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_425) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_425(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_425(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_426) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_426(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_426(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_427) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_427(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_427(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_428) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_428(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_428(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_429) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_429(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_429(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_430) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_430(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_430(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_431) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_431(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_431(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_432) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_432(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_432(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_433) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_433(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_433(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_434) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_434(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_434(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_435) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_435(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_435(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_436) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_436(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_436(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_437) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_437(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_437(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_438) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_438(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_438(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_439) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_439(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_439(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_440) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_440(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_440(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_441) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_441(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_441(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_442) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_442(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_442(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_443) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_443(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_443(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_444) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_444(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_444(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_445) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_445(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_445(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_446) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_446(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_446(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_447) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_447(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_447(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_448) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_448(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_448(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_449) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_449(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_449(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_450) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_450(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_450(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_451) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_451(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_451(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_452) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_452(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_452(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_453) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_453(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_453(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_454) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_454(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_454(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_455) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_455(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_455(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_456) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_456(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_456(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_457) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_457(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_457(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_458) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_458(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_458(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_459) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_459(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_459(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_460) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_460(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_460(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_461) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_461(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_461(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_462) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_462(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_462(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_463) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_463(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_463(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_464) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_464(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_464(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_465) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_465(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_465(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_466) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_466(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_466(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_467) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_467(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_467(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_468) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_468(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_468(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_469) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_469(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_469(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_470) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_470(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_470(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_471) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_471(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_471(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_472) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_472(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_472(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_473) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_473(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_473(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_474) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_474(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_474(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_475) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_475(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_475(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_476) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_476(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_476(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_477) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_477(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_477(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_478) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_478(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_478(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_479) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_479(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_479(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_480) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_480(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_480(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_481) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_481(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_481(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_482) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_482(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_482(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_483) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_483(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_483(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_484) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_484(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_484(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_485) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_485(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_485(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_486) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_486(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_486(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_487) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_487(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_487(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_488) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_488(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_488(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_489) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_489(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_489(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_490) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_490(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_490(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_491) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_491(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_491(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_492) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_492(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_492(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_493) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_493(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_493(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_494) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_494(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_494(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_495) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_495(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_495(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_496) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_496(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_496(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_497) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_497(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_497(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_498) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_498(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_498(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_499) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_499(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_499(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_500) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_500(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_500(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_501) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_501(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_501(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_502) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_502(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_502(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_503) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_503(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_503(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_504) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_504(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_504(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_505) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_505(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_505(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_506) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_506(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_506(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_507) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_507(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_507(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_508) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_508(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_508(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_509) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_509(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_509(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_510) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_510(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_510(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_511) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_511(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_511(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_512) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_512(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_512(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_513) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_513(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_513(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_514) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_514(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_514(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_515) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_515(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_515(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_516) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_516(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_516(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_517) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_517(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_517(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_518) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_518(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_518(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_519) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_519(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_519(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_520) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_520(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_520(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_521) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_521(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_521(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_522) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_522(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_522(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_523) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_523(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_523(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_524) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_524(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_524(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_525) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_525(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_525(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_526) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_526(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_526(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_527) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_527(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_527(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_528) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_528(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_528(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_529) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_529(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_529(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_530) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_530(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_530(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_531) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_531(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_531(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_532) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_532(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_532(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_533) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_533(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_533(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_534) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_534(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_534(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_535) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_535(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_535(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_536) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_536(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_536(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_537) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_537(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_537(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_538) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_538(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_538(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_539) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_539(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_539(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_540) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_540(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_540(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_541) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_541(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_541(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_542) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_542(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_542(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_543) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_543(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_543(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_544) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_544(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_544(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_545) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_545(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_545(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_546) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_546(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_546(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_547) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_547(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_547(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_548) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_548(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_548(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_549) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_549(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_549(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_550) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_550(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_550(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_551) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_551(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_551(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_552) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_552(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_552(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_553) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_553(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_553(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_554) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_554(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_554(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_555) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_555(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_555(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_556) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_556(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_556(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_557) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_557(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_557(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_558) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_558(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_558(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_559) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_559(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_559(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_560) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_560(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_560(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_561) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_561(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_561(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_562) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_562(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_562(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_563) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_563(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_563(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_564) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_564(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_564(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_565) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_565(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_565(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_566) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_566(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_566(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_567) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_567(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_567(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_568) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_568(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_568(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_569) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_569(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_569(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_570) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_570(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_570(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_571) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_571(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_571(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_572) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_572(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_572(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_573) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_573(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_573(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_574) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_574(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_574(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_575) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_575(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_575(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_576) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_576(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_576(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_577) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_577(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_577(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_578) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_578(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_578(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_579) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_579(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_579(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_580) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_580(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_580(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_581) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_581(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_581(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_582) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_582(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_582(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_583) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_583(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_583(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_584) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_584(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_584(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_585) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_585(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_585(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_586) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_586(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_586(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_587) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_587(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_587(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_588) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_588(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_588(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_589) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_589(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_589(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_590) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_590(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_590(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_591) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_591(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_591(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_592) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_592(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_592(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_593) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_593(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_593(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_594) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_594(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_594(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_595) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_595(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_595(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_596) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_596(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_596(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_597) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_597(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_597(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_598) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_598(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_598(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_599) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_599(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_599(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_600) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_600(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_600(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_601) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_601(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_601(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_602) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_602(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_602(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_603) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_603(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_603(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_604) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_604(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_604(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_605) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_605(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_605(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_606) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_606(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_606(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_607) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_607(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_607(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_608) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_608(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_608(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_609) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_609(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_609(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_610) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_610(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_610(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_611) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_611(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_611(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_612) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_612(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_612(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_613) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_613(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_613(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_614) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_614(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_614(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_615) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_615(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_615(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_616) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_616(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_616(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_617) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_617(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_617(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_618) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_618(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_618(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_619) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_619(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_619(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_620) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_620(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_620(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_621) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_621(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_621(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_622) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_622(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_622(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_623) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_623(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_623(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_624) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_624(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_624(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_625) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_625(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_625(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_626) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_626(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_626(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_627) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_627(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_627(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_628) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_628(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_628(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_629) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_629(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_629(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_630) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_630(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_630(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_631) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_631(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_631(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_632) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_632(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_632(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_633) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_633(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_633(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_634) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_634(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_634(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_635) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_635(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_635(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_636) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_636(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_636(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_637) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_637(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_637(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_638) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_638(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_638(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_639) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_639(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_639(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_640) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_640(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_640(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_641) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_641(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_641(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_642) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_642(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_642(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_643) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_643(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_643(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_644) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_644(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_644(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_645) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_645(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_645(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_646) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_646(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_646(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_647) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_647(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_647(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_648) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_648(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_648(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_649) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_649(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_649(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_650) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_650(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_650(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_651) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_651(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_651(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_652) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_652(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_652(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_653) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_653(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_653(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_654) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_654(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_654(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_655) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_655(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_655(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_656) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_656(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_656(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_657) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_657(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_657(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_658) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_658(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_658(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_659) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_659(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_659(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_660) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_660(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_660(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_661) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_661(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_661(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_662) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_662(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_662(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_663) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_663(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_663(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_664) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_664(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_664(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_665) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_665(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_665(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_666) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_666(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_666(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_667) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_667(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_667(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_668) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_668(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_668(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_669) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_669(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_669(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_670) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_670(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_670(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_671) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_671(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_671(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_672) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_672(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_672(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_673) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_673(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_673(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_674) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_674(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_674(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_675) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_675(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_675(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_676) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_676(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_676(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_677) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_677(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_677(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_678) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_678(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_678(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_679) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_679(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_679(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_680) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_680(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_680(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_681) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_681(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_681(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_682) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_682(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_682(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_683) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_683(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_683(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_684) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_684(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_684(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_685) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_685(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_685(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_686) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_686(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_686(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_687) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_687(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_687(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_688) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_688(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_688(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_689) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_689(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_689(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_690) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_690(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_690(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_691) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_691(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_691(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_692) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_692(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_692(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_693) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_693(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_693(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_694) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_694(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_694(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_695) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_695(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_695(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_696) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_696(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_696(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_697) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_697(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_697(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_698) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_698(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_698(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_699) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_699(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_699(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_700) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_700(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_700(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_701) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_701(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_701(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_702) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_702(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_702(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_703) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_703(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_703(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_704) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_704(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_704(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_705) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_705(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_705(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_706) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_706(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_706(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_707) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_707(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_707(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_708) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_708(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_708(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_709) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_709(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_709(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_710) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_710(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_710(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_711) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_711(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_711(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_712) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_712(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_712(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_713) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_713(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_713(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_714) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_714(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_714(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_715) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_715(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_715(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_716) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_716(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_716(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_717) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_717(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_717(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_718) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_718(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_718(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_719) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_719(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_719(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_720) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_720(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_720(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_721) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_721(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_721(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_722) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_722(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_722(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_723) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_723(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_723(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_724) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_724(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_724(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_725) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_725(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_725(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_726) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_726(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_726(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_727) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_727(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_727(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_728) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_728(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_728(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_729) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_729(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_729(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_730) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_730(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_730(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_731) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_731(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_731(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_732) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_732(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_732(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_733) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_733(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_733(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_734) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_734(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_734(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_735) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_735(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_735(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_736) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_736(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_736(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_737) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_737(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_737(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_738) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_738(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_738(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_739) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_739(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_739(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_740) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_740(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_740(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_741) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_741(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_741(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_742) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_742(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_742(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_743) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_743(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_743(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_744) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_744(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_744(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_745) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_745(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_745(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_746) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_746(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_746(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_747) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_747(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_747(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_748) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_748(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_748(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_749) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_749(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_749(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_750) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_750(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_750(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_751) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_751(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_751(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_752) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_752(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_752(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_753) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_753(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_753(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_754) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_754(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_754(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_755) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_755(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_755(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_756) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_756(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_756(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_757) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_757(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_757(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_758) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_758(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_758(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_759) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_759(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_759(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_760) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_760(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_760(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_761) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_761(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_761(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_762) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_762(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_762(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_763) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_763(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_763(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_764) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_764(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_764(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_765) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_765(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_765(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_766) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_766(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_766(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_767) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_767(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_767(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_768) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_768(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_768(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_769) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_769(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_769(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_770) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_770(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_770(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_771) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_771(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_771(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_772) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_772(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_772(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_773) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_773(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_773(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_774) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_774(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_774(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_775) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_775(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_775(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_776) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_776(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_776(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_777) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_777(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_777(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_778) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_778(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_778(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_779) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_779(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_779(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_780) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_780(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_780(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_781) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_781(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_781(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_782) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_782(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_782(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_783) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_783(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_783(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_784) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_784(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_784(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_785) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_785(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_785(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_786) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_786(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_786(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_787) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_787(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_787(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_788) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_788(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_788(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_789) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_789(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_789(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_790) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_790(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_790(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_791) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_791(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_791(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_792) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_792(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_792(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_793) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_793(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_793(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_794) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_794(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_794(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_795) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_795(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_795(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_796) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_796(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_796(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_797) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_797(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_797(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_798) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_798(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_798(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_799) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_799(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_799(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_800) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_800(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_800(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_801) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_801(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_801(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_802) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_802(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_802(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_803) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_803(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_803(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_804) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_804(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_804(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_805) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_805(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_805(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_806) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_806(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_806(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_807) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_807(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_807(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_808) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_808(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_808(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_809) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_809(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_809(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_810) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_810(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_810(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_811) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_811(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_811(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_812) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_812(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_812(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_813) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_813(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_813(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_814) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_814(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_814(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_815) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_815(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_815(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_816) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_816(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_816(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_817) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_817(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_817(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_818) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_818(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_818(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_819) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_819(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_819(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_820) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_820(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_820(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_821) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_821(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_821(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_822) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_822(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_822(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_823) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_823(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_823(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_824) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_824(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_824(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_825) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_825(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_825(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_826) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_826(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_826(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_827) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_827(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_827(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_828) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_828(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_828(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_829) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_829(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_829(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_830) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_830(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_830(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_831) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_831(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_831(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_832) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_832(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_832(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_833) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_833(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_833(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_834) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_834(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_834(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_835) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_835(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_835(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_836) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_836(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_836(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_837) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_837(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_837(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_838) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_838(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_838(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_839) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_839(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_839(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_840) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_840(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_840(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_841) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_841(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_841(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_842) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_842(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_842(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_843) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_843(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_843(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_844) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_844(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_844(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_845) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_845(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_845(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_846) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_846(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_846(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_847) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_847(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_847(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_848) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_848(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_848(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_849) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_849(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_849(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_850) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_850(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_850(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_851) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_851(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_851(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_852) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_852(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_852(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_853) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_853(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_853(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_854) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_854(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_854(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_855) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_855(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_855(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_856) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_856(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_856(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_857) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_857(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_857(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_858) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_858(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_858(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_859) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_859(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_859(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_860) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_860(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_860(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_861) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_861(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_861(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_862) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_862(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_862(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_863) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_863(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_863(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_864) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_864(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_864(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_865) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_865(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_865(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_866) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_866(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_866(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_867) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_867(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_867(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_868) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_868(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_868(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_869) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_869(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_869(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_870) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_870(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_870(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_871) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_871(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_871(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_872) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_872(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_872(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_873) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_873(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_873(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_874) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_874(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_874(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_875) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_875(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_875(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_876) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_876(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_876(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_877) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_877(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_877(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_878) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_878(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_878(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_879) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_879(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_879(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_880) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_880(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_880(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_881) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_881(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_881(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_882) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_882(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_882(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_883) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_883(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_883(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_884) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_884(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_884(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_885) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_885(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_885(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_886) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_886(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_886(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_887) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_887(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_887(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_888) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_888(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_888(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_889) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_889(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_889(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_890) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_890(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_890(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_891) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_891(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_891(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_892) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_892(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_892(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_893) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_893(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_893(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_894) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_894(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_894(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_895) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_895(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_895(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_896) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_896(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_896(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_897) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_897(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_897(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_898) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_898(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_898(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_899) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_899(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_899(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_900) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_900(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_900(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_901) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_901(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_901(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_902) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_902(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_902(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_903) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_903(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_903(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_904) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_904(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_904(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_905) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_905(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_905(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_906) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_906(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_906(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_907) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_907(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_907(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_908) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_908(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_908(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_909) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_909(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_909(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_910) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_910(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_910(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_911) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_911(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_911(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_912) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_912(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_912(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_913) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_913(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_913(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_914) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_914(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_914(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_915) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_915(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_915(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_916) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_916(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_916(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_917) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_917(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_917(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_918) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_918(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_918(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_919) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_919(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_919(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_920) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_920(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_920(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_921) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_921(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_921(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_922) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_922(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_922(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_923) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_923(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_923(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_924) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_924(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_924(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_925) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_925(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_925(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_926) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_926(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_926(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_927) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_927(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_927(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_928) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_928(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_928(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_929) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_929(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_929(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_930) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_930(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_930(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_931) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_931(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_931(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_932) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_932(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_932(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_933) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_933(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_933(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_934) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_934(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_934(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_935) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_935(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_935(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_936) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_936(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_936(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_937) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_937(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_937(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_938) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_938(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_938(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_939) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_939(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_939(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_940) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_940(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_940(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_941) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_941(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_941(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_942) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_942(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_942(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_943) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_943(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_943(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_944) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_944(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_944(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_945) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_945(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_945(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_946) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_946(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_946(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_947) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_947(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_947(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_948) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_948(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_948(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_949) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_949(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_949(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_950) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_950(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_950(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_951) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_951(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_951(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_952) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_952(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_952(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_953) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_953(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_953(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_954) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_954(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_954(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_955) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_955(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_955(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_956) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_956(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_956(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_957) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_957(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_957(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_958) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_958(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_958(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_959) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_959(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_959(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_960) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_960(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_960(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_961) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_961(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_961(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_962) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_962(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_962(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_963) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_963(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_963(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_964) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_964(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_964(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_965) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_965(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_965(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_966) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_966(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_966(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_967) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_967(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_967(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_968) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_968(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_968(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_969) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_969(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_969(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_970) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_970(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_970(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_971) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_971(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_971(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_972) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_972(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_972(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_973) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_973(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_973(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_974) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_974(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_974(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_975) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_975(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_975(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_976) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_976(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_976(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_977) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_977(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_977(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_978) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_978(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_978(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_979) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_979(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_979(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_980) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_980(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_980(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_981) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_981(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_981(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_982) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_982(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_982(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_983) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_983(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_983(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_984) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_984(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_984(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_985) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_985(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_985(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_986) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_986(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_986(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_987) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_987(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_987(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_988) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_988(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_988(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_989) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_989(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_989(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_990) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_990(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_990(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_991) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_991(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_991(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_992) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_992(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_992(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_993) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_993(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_993(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_994) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_994(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_994(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_995) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_995(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_995(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_996) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_996(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_996(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_997) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_997(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_997(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_998) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_998(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_998(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_999) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_999(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_999(FUNC)
+#endif
+
diff --git a/contrib/eigen/test/spqr_support.cpp b/contrib/eigen/test/spqr_support.cpp
index 81e63b6a57347aaefdde213a001f1ffdf0e3a192..79c2c12fc4751917007f9c56fa17ffe48e65bfb0 100644
--- a/contrib/eigen/test/spqr_support.cpp
+++ b/contrib/eigen/test/spqr_support.cpp
@@ -57,7 +57,7 @@ template<typename Scalar> void test_spqr_scalar()
   refX = dA.colPivHouseholderQr().solve(b);
   VERIFY(x.isApprox(refX,test_precision<Scalar>()));
 }
-void test_spqr_support()
+EIGEN_DECLARE_TEST(spqr_support)
 {
   CALL_SUBTEST_1(test_spqr_scalar<double>());
   CALL_SUBTEST_2(test_spqr_scalar<std::complex<double> >());
diff --git a/contrib/eigen/test/stable_norm.cpp b/contrib/eigen/test/stable_norm.cpp
index ac8b129112cb5b0579b309753f0c7cf64394de61..cb8a80c18da7bbab81465f339a337aaa81424cff 100644
--- a/contrib/eigen/test/stable_norm.cpp
+++ b/contrib/eigen/test/stable_norm.cpp
@@ -170,7 +170,13 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
     VERIFY(!(numext::isfinite)(v.norm()));          VERIFY((numext::isnan)(v.norm()));
     VERIFY(!(numext::isfinite)(v.stableNorm()));    VERIFY((numext::isnan)(v.stableNorm()));
     VERIFY(!(numext::isfinite)(v.blueNorm()));      VERIFY((numext::isnan)(v.blueNorm()));
-    VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isnan)(v.hypotNorm()));
+    if (i2 != i || j2 != j) {
+      // hypot propagates inf over NaN.
+      VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isinf)(v.hypotNorm()));
+    } else {
+      // inf is overwritten by NaN, expect norm to be NaN.
+      VERIFY(!(numext::isfinite)(v.hypotNorm()));     VERIFY((numext::isnan)(v.hypotNorm()));
+    }
   }
 
   // stableNormalize[d]
@@ -189,13 +195,51 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
   }
 }
 
-void test_stable_norm()
+template<typename Scalar>
+void test_hypot()
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Scalar factor = internal::random<Scalar>();
+  while(numext::abs2(factor)<RealScalar(1e-4))
+    factor = internal::random<Scalar>();
+  Scalar big = factor * ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
+  
+  factor = internal::random<Scalar>();
+  while(numext::abs2(factor)<RealScalar(1e-4))
+    factor = internal::random<Scalar>();
+  Scalar small = factor * ((std::numeric_limits<RealScalar>::min)() * RealScalar(1e4));
+
+  Scalar  one   (1),
+          zero  (0),
+          sqrt2 (std::sqrt(2)),
+          nan   (std::numeric_limits<RealScalar>::quiet_NaN());
+
+  Scalar a = internal::random<Scalar>(-1,1);
+  Scalar b = internal::random<Scalar>(-1,1);
+  VERIFY_IS_APPROX(numext::hypot(a,b),std::sqrt(numext::abs2(a)+numext::abs2(b)));
+  VERIFY_IS_EQUAL(numext::hypot(zero,zero), zero);
+  VERIFY_IS_APPROX(numext::hypot(one, one), sqrt2);
+  VERIFY_IS_APPROX(numext::hypot(big,big), sqrt2*numext::abs(big));
+  VERIFY_IS_APPROX(numext::hypot(small,small), sqrt2*numext::abs(small));
+  VERIFY_IS_APPROX(numext::hypot(small,big), numext::abs(big));
+  VERIFY((numext::isnan)(numext::hypot(nan,a)));
+  VERIFY((numext::isnan)(numext::hypot(a,nan)));
+}
+
+EIGEN_DECLARE_TEST(stable_norm)
 {
   for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_3( test_hypot<double>() );
+    CALL_SUBTEST_4( test_hypot<float>() );
+    CALL_SUBTEST_5( test_hypot<std::complex<double> >() );
+    CALL_SUBTEST_6( test_hypot<std::complex<float> >() );
+
     CALL_SUBTEST_1( stable_norm(Matrix<float, 1, 1>()) );
     CALL_SUBTEST_2( stable_norm(Vector4d()) );
     CALL_SUBTEST_3( stable_norm(VectorXd(internal::random<int>(10,2000))) );
+    CALL_SUBTEST_3( stable_norm(MatrixXd(internal::random<int>(10,200), internal::random<int>(10,200))) );
     CALL_SUBTEST_4( stable_norm(VectorXf(internal::random<int>(10,2000))) );
     CALL_SUBTEST_5( stable_norm(VectorXcd(internal::random<int>(10,2000))) );
+    CALL_SUBTEST_6( stable_norm(VectorXcf(internal::random<int>(10,2000))) );
   }
 }
diff --git a/contrib/eigen/test/stddeque.cpp b/contrib/eigen/test/stddeque.cpp
index b6955f747587bbfde1dcd464a3ef0155b885a081..ea85ea968f2e2f02871e1d93a752c223dbdc5835 100644
--- a/contrib/eigen/test/stddeque.cpp
+++ b/contrib/eigen/test/stddeque.cpp
@@ -100,7 +100,7 @@ void check_stddeque_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v.back(), x);
 }
 
-void test_stddeque()
+EIGEN_DECLARE_TEST(stddeque)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
diff --git a/contrib/eigen/test/stddeque_overload.cpp b/contrib/eigen/test/stddeque_overload.cpp
index f495b5a04ab02b6514a5386e5c14e43ad62710c5..0f59f0695b2c5a432d64c7489acfa7d24026ef9d 100644
--- a/contrib/eigen/test/stddeque_overload.cpp
+++ b/contrib/eigen/test/stddeque_overload.cpp
@@ -28,8 +28,8 @@ EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaterniond)
 template<typename MatrixType>
 void check_stddeque_matrix(const MatrixType& m)
 {
-  typename MatrixType::Index rows = m.rows();
-  typename MatrixType::Index cols = m.cols();
+  Index rows = m.rows();
+  Index cols = m.cols();
   MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
   std::deque<MatrixType> v(10, MatrixType::Zero(rows,cols)), w(20, y);
   v[5] = x;
@@ -128,7 +128,7 @@ void check_stddeque_quaternion(const QuaternionType&)
   }
 }
 
-void test_stddeque_overload()
+EIGEN_DECLARE_TEST(stddeque_overload)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
diff --git a/contrib/eigen/test/stdlist.cpp b/contrib/eigen/test/stdlist.cpp
index 23b30ccaf1f514405f00037d6c4bcc537e8a1cf0..1af9e6ecbfbd1f47bcb6f1e43752b7379787a657 100644
--- a/contrib/eigen/test/stdlist.cpp
+++ b/contrib/eigen/test/stdlist.cpp
@@ -100,7 +100,7 @@ void check_stdlist_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v.back(), x);
 }
 
-void test_stdlist()
+EIGEN_DECLARE_TEST(stdlist)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
diff --git a/contrib/eigen/test/stdlist_overload.cpp b/contrib/eigen/test/stdlist_overload.cpp
index aea7a28465cf07d9d66c7958d61611b9b4d3085a..a78516e24eba1a093a0141cfbf3a3b6ed8b383ca 100644
--- a/contrib/eigen/test/stdlist_overload.cpp
+++ b/contrib/eigen/test/stdlist_overload.cpp
@@ -44,8 +44,8 @@ void set(Container & c, Position position, const Value & value)
 template<typename MatrixType>
 void check_stdlist_matrix(const MatrixType& m)
 {
-  typename MatrixType::Index rows = m.rows();
-  typename MatrixType::Index cols = m.cols();
+  Index rows = m.rows();
+  Index cols = m.cols();
   MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
   std::list<MatrixType> v(10, MatrixType::Zero(rows,cols)), w(20, y);
   typename std::list<MatrixType>::iterator itv = get(v, 5);
@@ -162,7 +162,7 @@ void check_stdlist_quaternion(const QuaternionType&)
   }
 }
 
-void test_stdlist_overload()
+EIGEN_DECLARE_TEST(stdlist_overload)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
diff --git a/contrib/eigen/test/stdvector.cpp b/contrib/eigen/test/stdvector.cpp
index 383d9a5095ebff2b85dec9e37f0e7aff43dc444f..18de240c6e68f490b590808cef56ad0515fc6ff2 100644
--- a/contrib/eigen/test/stdvector.cpp
+++ b/contrib/eigen/test/stdvector.cpp
@@ -14,8 +14,8 @@
 template<typename MatrixType>
 void check_stdvector_matrix(const MatrixType& m)
 {
-  typename MatrixType::Index rows = m.rows();
-  typename MatrixType::Index cols = m.cols();
+  Index rows = m.rows();
+  Index cols = m.cols();
   MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
   std::vector<MatrixType,Eigen::aligned_allocator<MatrixType> > v(10, MatrixType::Zero(rows,cols)), w(20, y);
   v[5] = x;
@@ -127,7 +127,7 @@ void std_vector_gcc_warning()
   v.push_back(T());
 }
 
-void test_stdvector()
+EIGEN_DECLARE_TEST(stdvector)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
diff --git a/contrib/eigen/test/stdvector_overload.cpp b/contrib/eigen/test/stdvector_overload.cpp
index 637e3ef52f34bb573407c566946ddbe27ec6f41a..da04f8a8434c84a5a1bcf1aed3b6aa1250b5bc35 100644
--- a/contrib/eigen/test/stdvector_overload.cpp
+++ b/contrib/eigen/test/stdvector_overload.cpp
@@ -28,8 +28,8 @@ EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Quaterniond)
 template<typename MatrixType>
 void check_stdvector_matrix(const MatrixType& m)
 {
-  typename MatrixType::Index rows = m.rows();
-  typename MatrixType::Index cols = m.cols();
+  Index rows = m.rows();
+  Index cols = m.cols();
   MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
   std::vector<MatrixType> v(10, MatrixType::Zero(rows,cols)), w(20, y);
   v[5] = x;
@@ -131,7 +131,7 @@ void check_stdvector_quaternion(const QuaternionType&)
   }
 }
 
-void test_stdvector_overload()
+EIGEN_DECLARE_TEST(stdvector_overload)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
diff --git a/contrib/eigen/test/stl_iterators.cpp b/contrib/eigen/test/stl_iterators.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72bbf8250dcf17bc8d4d42788c22f20ac879700e
--- /dev/null
+++ b/contrib/eigen/test/stl_iterators.cpp
@@ -0,0 +1,562 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <iterator>
+#include <numeric>
+
+template< class Iterator >
+std::reverse_iterator<Iterator>
+make_reverse_iterator( Iterator i )
+{
+  return std::reverse_iterator<Iterator>(i);
+}
+
+#if !EIGEN_HAS_CXX11
+template<class ForwardIt>
+ForwardIt is_sorted_until(ForwardIt firstIt, ForwardIt lastIt)
+{
+    if (firstIt != lastIt) {
+        ForwardIt next = firstIt;
+        while (++next != lastIt) {
+            if (*next < *firstIt)
+                return next;
+            firstIt = next;
+        }
+    }
+    return lastIt;
+}
+template<class ForwardIt>
+bool is_sorted(ForwardIt firstIt, ForwardIt lastIt)
+{
+    return ::is_sorted_until(firstIt, lastIt) == lastIt;
+}
+#else
+using std::is_sorted;
+#endif
+
+template<typename XprType>
+bool is_pointer_based_stl_iterator(const internal::pointer_based_stl_iterator<XprType> &) { return true; }
+
+template<typename XprType>
+bool is_generic_randaccess_stl_iterator(const internal::generic_randaccess_stl_iterator<XprType> &) { return true; }
+
+template<typename Iter>
+bool is_default_constructible_and_assignable(const Iter& it)
+{
+#if EIGEN_HAS_CXX11
+  VERIFY(std::is_default_constructible<Iter>::value);
+  VERIFY(std::is_nothrow_default_constructible<Iter>::value);
+#endif
+  Iter it2;
+  it2 = it;
+  return (it==it2);
+}
+
+template<typename Xpr>
+void check_begin_end_for_loop(Xpr xpr)
+{
+  const Xpr& cxpr(xpr);
+  Index i = 0;
+
+  i = 0;
+  for(typename Xpr::iterator it = xpr.begin(); it!=xpr.end(); ++it) { VERIFY_IS_EQUAL(*it,xpr[i++]); }
+
+  i = 0;
+  for(typename Xpr::const_iterator it = xpr.cbegin(); it!=xpr.cend(); ++it) { VERIFY_IS_EQUAL(*it,xpr[i++]); }
+
+  i = 0;
+  for(typename Xpr::const_iterator it = cxpr.begin(); it!=cxpr.end(); ++it) { VERIFY_IS_EQUAL(*it,xpr[i++]); }
+
+  i = 0;
+  for(typename Xpr::const_iterator it = xpr.begin(); it!=xpr.end(); ++it) { VERIFY_IS_EQUAL(*it,xpr[i++]); }
+
+  {
+    // simple API check
+    typename Xpr::const_iterator cit = xpr.begin();
+    cit = xpr.cbegin();
+
+    #if EIGEN_HAS_CXX11
+    auto tmp1 = xpr.begin();
+    VERIFY(tmp1==xpr.begin());
+    auto tmp2 = xpr.cbegin();
+    VERIFY(tmp2==xpr.cbegin());
+    #endif
+  }
+
+  VERIFY( xpr.end() -xpr.begin()  == xpr.size() );
+  VERIFY( xpr.cend()-xpr.begin()  == xpr.size() );
+  VERIFY( xpr.end() -xpr.cbegin() == xpr.size() );
+  VERIFY( xpr.cend()-xpr.cbegin() == xpr.size() );
+
+  if(xpr.size()>0) {
+    VERIFY(xpr.begin() != xpr.end());
+    VERIFY(xpr.begin() < xpr.end());
+    VERIFY(xpr.begin() <= xpr.end());
+    VERIFY(!(xpr.begin() == xpr.end()));
+    VERIFY(!(xpr.begin() > xpr.end()));
+    VERIFY(!(xpr.begin() >= xpr.end()));
+    
+    VERIFY(xpr.cbegin() != xpr.end());
+    VERIFY(xpr.cbegin() < xpr.end());
+    VERIFY(xpr.cbegin() <= xpr.end());
+    VERIFY(!(xpr.cbegin() == xpr.end()));
+    VERIFY(!(xpr.cbegin() > xpr.end()));
+    VERIFY(!(xpr.cbegin() >= xpr.end()));
+
+    VERIFY(xpr.begin() != xpr.cend());
+    VERIFY(xpr.begin() < xpr.cend());
+    VERIFY(xpr.begin() <= xpr.cend());
+    VERIFY(!(xpr.begin() == xpr.cend()));
+    VERIFY(!(xpr.begin() > xpr.cend()));
+    VERIFY(!(xpr.begin() >= xpr.cend()));
+  }
+}
+
+template<typename Scalar, int Rows, int Cols>
+void test_stl_iterators(int rows=Rows, int cols=Cols)
+{
+  typedef Matrix<Scalar,Rows,1> VectorType;
+  #if EIGEN_HAS_CXX11
+  typedef Matrix<Scalar,1,Cols> RowVectorType;
+  #endif
+  typedef Matrix<Scalar,Rows,Cols,ColMajor> ColMatrixType;
+  typedef Matrix<Scalar,Rows,Cols,RowMajor> RowMatrixType;
+  VectorType v = VectorType::Random(rows);
+  const VectorType& cv(v);
+  ColMatrixType A = ColMatrixType::Random(rows,cols);
+  const ColMatrixType& cA(A);
+  RowMatrixType B = RowMatrixType::Random(rows,cols);
+  
+  Index i, j;
+
+  // Verify that iterators are default constructible (See bug #1900)
+  {
+    VERIFY( is_default_constructible_and_assignable(v.begin()));
+    VERIFY( is_default_constructible_and_assignable(v.end()));
+    VERIFY( is_default_constructible_and_assignable(cv.begin()));
+    VERIFY( is_default_constructible_and_assignable(cv.end()));
+
+    VERIFY( is_default_constructible_and_assignable(A.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(A.row(0).end()));
+    VERIFY( is_default_constructible_and_assignable(cA.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(cA.row(0).end()));
+
+    VERIFY( is_default_constructible_and_assignable(B.row(0).begin()));
+    VERIFY( is_default_constructible_and_assignable(B.row(0).end()));
+  }
+
+  // Check we got a fast pointer-based iterator when expected
+  {
+    VERIFY( is_pointer_based_stl_iterator(v.begin()) );
+    VERIFY( is_pointer_based_stl_iterator(v.end()) );
+    VERIFY( is_pointer_based_stl_iterator(cv.begin()) );
+    VERIFY( is_pointer_based_stl_iterator(cv.end()) );
+
+    j = internal::random<Index>(0,A.cols()-1);
+    VERIFY( is_pointer_based_stl_iterator(A.col(j).begin()) );
+    VERIFY( is_pointer_based_stl_iterator(A.col(j).end()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.col(j).begin()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.col(j).end()) );
+
+    i = internal::random<Index>(0,A.rows()-1);
+    VERIFY( is_pointer_based_stl_iterator(A.row(i).begin()) );
+    VERIFY( is_pointer_based_stl_iterator(A.row(i).end()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.row(i).begin()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.row(i).end()) );
+
+    VERIFY( is_pointer_based_stl_iterator(A.reshaped().begin()) );
+    VERIFY( is_pointer_based_stl_iterator(A.reshaped().end()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.reshaped().begin()) );
+    VERIFY( is_pointer_based_stl_iterator(cA.reshaped().end()) );
+
+    VERIFY( is_pointer_based_stl_iterator(B.template reshaped<AutoOrder>().begin()) );
+    VERIFY( is_pointer_based_stl_iterator(B.template reshaped<AutoOrder>().end()) );
+
+    VERIFY( is_generic_randaccess_stl_iterator(A.template reshaped<RowMajor>().begin()) );
+    VERIFY( is_generic_randaccess_stl_iterator(A.template reshaped<RowMajor>().end()) );
+  }
+
+  {
+    check_begin_end_for_loop(v);
+    check_begin_end_for_loop(A.col(internal::random<Index>(0,A.cols()-1)));
+    check_begin_end_for_loop(A.row(internal::random<Index>(0,A.rows()-1)));
+    check_begin_end_for_loop(v+v);
+  }
+
+#if EIGEN_HAS_CXX11
+  // check swappable
+  {
+    using std::swap;
+    // pointer-based
+    {
+      VectorType v_copy = v;
+      auto a = v.begin();
+      auto b = v.end()-1;
+      swap(a,b);
+      VERIFY_IS_EQUAL(v,v_copy);
+      VERIFY_IS_EQUAL(*b,*v.begin());
+      VERIFY_IS_EQUAL(*b,v(0));
+      VERIFY_IS_EQUAL(*a,v.end()[-1]);
+      VERIFY_IS_EQUAL(*a,v(last));
+    }
+
+    // generic
+    {
+      RowMatrixType B_copy = B;
+      auto Br = B.reshaped();
+      auto a = Br.begin();
+      auto b = Br.end()-1;
+      swap(a,b);
+      VERIFY_IS_EQUAL(B,B_copy);
+      VERIFY_IS_EQUAL(*b,*Br.begin());
+      VERIFY_IS_EQUAL(*b,Br(0));
+      VERIFY_IS_EQUAL(*a,Br.end()[-1]);
+      VERIFY_IS_EQUAL(*a,Br(last));
+    }
+  }
+
+  // check non-const iterator with for-range loops
+  {
+    i = 0;
+    for(auto x : v) { VERIFY_IS_EQUAL(x,v[i++]); }
+
+    j = internal::random<Index>(0,A.cols()-1);
+    i = 0;
+    for(auto x : A.col(j)) { VERIFY_IS_EQUAL(x,A(i++,j)); }
+
+    i = 0;
+    for(auto x : (v+A.col(j))) { VERIFY_IS_APPROX(x,v(i)+A(i,j)); ++i; }
+
+    j = 0;
+    i = internal::random<Index>(0,A.rows()-1);
+    for(auto x : A.row(i)) { VERIFY_IS_EQUAL(x,A(i,j++)); }
+
+    i = 0;
+    for(auto x : A.reshaped()) { VERIFY_IS_EQUAL(x,A(i++)); }
+  }
+
+  // same for const_iterator
+  {
+    i = 0;
+    for(auto x : cv) { VERIFY_IS_EQUAL(x,v[i++]); }
+
+    i = 0;
+    for(auto x : cA.reshaped()) { VERIFY_IS_EQUAL(x,A(i++)); }
+
+    j = 0;
+    i = internal::random<Index>(0,A.rows()-1);
+    for(auto x : cA.row(i)) { VERIFY_IS_EQUAL(x,A(i,j++)); }
+  }
+
+  // check reshaped() on row-major
+  {
+    i = 0;
+    Matrix<Scalar,Dynamic,Dynamic,ColMajor> Bc = B;
+    for(auto x : B.reshaped()) { VERIFY_IS_EQUAL(x,Bc(i++)); }
+  }
+
+  // check write access
+  {
+    VectorType w(v.size());
+    i = 0;
+    for(auto& x : w) { x = v(i++); }
+    VERIFY_IS_EQUAL(v,w);
+  }
+
+  // check for dangling pointers
+  {
+    // no dangling because pointer-based
+    {
+      j = internal::random<Index>(0,A.cols()-1);
+      auto it = A.col(j).begin();
+      for(i=0;i<rows;++i) {
+        VERIFY_IS_EQUAL(it[i],A(i,j));
+      }
+    }
+
+    // no dangling because pointer-based
+    {
+      i = internal::random<Index>(0,A.rows()-1);
+      auto it = A.row(i).begin();
+      for(j=0;j<cols;++j) { VERIFY_IS_EQUAL(it[j],A(i,j)); }
+    }
+
+    {
+      j = internal::random<Index>(0,A.cols()-1);
+      // this would produce a dangling pointer:
+      // auto it = (A+2*A).col(j).begin(); 
+      // we need to name the temporary expression:
+      auto tmp = (A+2*A).col(j);
+      auto it = tmp.begin();
+      for(i=0;i<rows;++i) {
+        VERIFY_IS_APPROX(it[i],3*A(i,j));
+      }
+    }
+  }
+
+  {
+    // check basic for loop on vector-wise iterators
+    j=0;
+    for (auto it = A.colwise().cbegin(); it != A.colwise().cend(); ++it, ++j) {
+      VERIFY_IS_APPROX( it->coeff(0), A(0,j) );
+      VERIFY_IS_APPROX( (*it).coeff(0), A(0,j) );
+    }
+    j=0;
+    for (auto it = A.colwise().begin(); it != A.colwise().end(); ++it, ++j) {
+      (*it).coeffRef(0) = (*it).coeff(0); // compilation check
+      it->coeffRef(0) = it->coeff(0);     // compilation check
+      VERIFY_IS_APPROX( it->coeff(0), A(0,j) );
+      VERIFY_IS_APPROX( (*it).coeff(0), A(0,j) );
+    }
+
+    // check valuetype gives us a copy
+    j=0;
+    for (auto it = A.colwise().cbegin(); it != A.colwise().cend(); ++it, ++j) {
+      typename decltype(it)::value_type tmp = *it;
+      VERIFY_IS_NOT_EQUAL( tmp.data() , it->data() );
+      VERIFY_IS_APPROX( tmp, A.col(j) );
+    }
+  }
+
+#endif
+
+  if(rows>=3) {
+    VERIFY_IS_EQUAL((v.begin()+rows/2)[1], v(rows/2+1));
+
+    VERIFY_IS_EQUAL((A.rowwise().begin()+rows/2)[1], A.row(rows/2+1));
+  }
+
+  if(cols>=3) {
+    VERIFY_IS_EQUAL((A.colwise().begin()+cols/2)[1], A.col(cols/2+1));
+  }
+
+  // check std::sort
+  {
+    // first check that is_sorted returns false when required
+    if(rows>=2)
+    {
+      v(1) = v(0)-Scalar(1);
+      #if EIGEN_HAS_CXX11
+      VERIFY(!is_sorted(std::begin(v),std::end(v)));
+      #else
+      VERIFY(!is_sorted(v.cbegin(),v.cend()));
+      #endif
+    }
+
+    // on a vector
+    {
+      std::sort(v.begin(),v.end());
+      VERIFY(is_sorted(v.begin(),v.end()));
+      VERIFY(!::is_sorted(make_reverse_iterator(v.end()),make_reverse_iterator(v.begin())));
+    }
+
+    // on a column of a column-major matrix -> pointer-based iterator and default increment
+    {
+      j = internal::random<Index>(0,A.cols()-1);
+      // std::sort(begin(A.col(j)),end(A.col(j))); // does not compile because this returns const iterators
+      typename ColMatrixType::ColXpr Acol = A.col(j);
+      std::sort(Acol.begin(),Acol.end());
+      VERIFY(is_sorted(Acol.cbegin(),Acol.cend()));
+      A.setRandom();
+
+      std::sort(A.col(j).begin(),A.col(j).end());
+      VERIFY(is_sorted(A.col(j).cbegin(),A.col(j).cend()));
+      A.setRandom();
+    }
+
+    // on a row of a rowmajor matrix -> pointer-based iterator and runtime increment
+    {
+      i = internal::random<Index>(0,A.rows()-1);
+      typename ColMatrixType::RowXpr Arow = A.row(i);
+      VERIFY_IS_EQUAL( std::distance(Arow.begin(),Arow.end()), cols);
+      std::sort(Arow.begin(),Arow.end());
+      VERIFY(is_sorted(Arow.cbegin(),Arow.cend()));
+      A.setRandom();
+
+      std::sort(A.row(i).begin(),A.row(i).end());
+      VERIFY(is_sorted(A.row(i).cbegin(),A.row(i).cend()));
+      A.setRandom();
+    }
+
+    // with a generic iterator
+    {
+      Reshaped<RowMatrixType,RowMatrixType::SizeAtCompileTime,1> B1 = B.reshaped();
+      std::sort(B1.begin(),B1.end());
+      VERIFY(is_sorted(B1.cbegin(),B1.cend()));
+      B.setRandom();
+
+      // assertion because nested expressions are different
+      // std::sort(B.reshaped().begin(),B.reshaped().end());
+      // VERIFY(is_sorted(B.reshaped().cbegin(),B.reshaped().cend()));
+      // B.setRandom();
+    }
+  }
+
+  // check with partial_sum
+  {
+    j = internal::random<Index>(0,A.cols()-1);
+    typename ColMatrixType::ColXpr Acol = A.col(j);
+    std::partial_sum(Acol.begin(), Acol.end(), v.begin());
+    VERIFY_IS_APPROX(v(seq(1,last)), v(seq(0,last-1))+Acol(seq(1,last)));
+
+    // inplace
+    std::partial_sum(Acol.begin(), Acol.end(), Acol.begin());
+    VERIFY_IS_APPROX(v, Acol);
+  }
+
+  // stress random access as required by std::nth_element
+  if(rows>=3)
+  {
+    v.setRandom();
+    VectorType v1 = v;
+    std::sort(v1.begin(),v1.end());
+    std::nth_element(v.begin(), v.begin()+rows/2, v.end());
+    VERIFY_IS_APPROX(v1(rows/2), v(rows/2));
+
+    v.setRandom();
+    v1 = v;
+    std::sort(v1.begin()+rows/2,v1.end());
+    std::nth_element(v.begin()+rows/2, v.begin()+rows/4, v.end());
+    VERIFY_IS_APPROX(v1(rows/4), v(rows/4));
+  }
+
+#if EIGEN_HAS_CXX11
+  // check rows/cols iterators with range-for loops
+  {
+    j = 0;
+    for(auto c : A.colwise()) { VERIFY_IS_APPROX(c.sum(), A.col(j).sum()); ++j; }
+    j = 0;
+    for(auto c : B.colwise()) { VERIFY_IS_APPROX(c.sum(), B.col(j).sum()); ++j; }
+
+    j = 0;
+    for(auto c : B.colwise()) {
+      i = 0;
+      for(auto& x : c) {
+        VERIFY_IS_EQUAL(x, B(i,j));
+        x = A(i,j);
+        ++i;
+      }
+      ++j;
+    }
+    VERIFY_IS_APPROX(A,B);
+    B.setRandom();
+    
+    i = 0;
+    for(auto r : A.rowwise()) { VERIFY_IS_APPROX(r.sum(), A.row(i).sum()); ++i; }
+    i = 0;
+    for(auto r : B.rowwise()) { VERIFY_IS_APPROX(r.sum(), B.row(i).sum()); ++i; }
+  }
+
+
+  // check rows/cols iterators with STL algorithms
+  {
+    RowVectorType row = RowVectorType::Random(cols);
+    A.rowwise() = row;
+    VERIFY( std::all_of(A.rowwise().begin(),  A.rowwise().end(),  [&row](typename ColMatrixType::RowXpr x) { return internal::isApprox(x.squaredNorm(),row.squaredNorm()); }) );
+    VERIFY( std::all_of(A.rowwise().rbegin(), A.rowwise().rend(), [&row](typename ColMatrixType::RowXpr x) { return internal::isApprox(x.squaredNorm(),row.squaredNorm()); }) );
+
+    VectorType col = VectorType::Random(rows);
+    A.colwise() = col;
+    VERIFY( std::all_of(A.colwise().begin(),   A.colwise().end(),   [&col](typename ColMatrixType::ColXpr x) { return internal::isApprox(x.squaredNorm(),col.squaredNorm()); }) );
+    VERIFY( std::all_of(A.colwise().rbegin(),  A.colwise().rend(),  [&col](typename ColMatrixType::ColXpr x) { return internal::isApprox(x.squaredNorm(),col.squaredNorm()); }) );
+    VERIFY( std::all_of(A.colwise().cbegin(),  A.colwise().cend(),  [&col](typename ColMatrixType::ConstColXpr x) { return internal::isApprox(x.squaredNorm(),col.squaredNorm()); }) );
+    VERIFY( std::all_of(A.colwise().crbegin(), A.colwise().crend(), [&col](typename ColMatrixType::ConstColXpr x) { return internal::isApprox(x.squaredNorm(),col.squaredNorm()); }) );
+
+    i = internal::random<Index>(0,A.rows()-1);
+    A.setRandom();
+    A.row(i).setZero();
+    VERIFY_IS_EQUAL( std::find_if(A.rowwise().begin(),  A.rowwise().end(),  [](typename ColMatrixType::RowXpr x) { return x.squaredNorm() == Scalar(0); })-A.rowwise().begin(),  i );
+    VERIFY_IS_EQUAL( std::find_if(A.rowwise().rbegin(), A.rowwise().rend(), [](typename ColMatrixType::RowXpr x) { return x.squaredNorm() == Scalar(0); })-A.rowwise().rbegin(), (A.rows()-1) - i );
+
+    j = internal::random<Index>(0,A.cols()-1);
+    A.setRandom();
+    A.col(j).setZero();
+    VERIFY_IS_EQUAL( std::find_if(A.colwise().begin(),  A.colwise().end(),  [](typename ColMatrixType::ColXpr x) { return x.squaredNorm() == Scalar(0); })-A.colwise().begin(),  j );
+    VERIFY_IS_EQUAL( std::find_if(A.colwise().rbegin(), A.colwise().rend(), [](typename ColMatrixType::ColXpr x) { return x.squaredNorm() == Scalar(0); })-A.colwise().rbegin(), (A.cols()-1) - j );
+  }
+
+  {
+    using VecOp = VectorwiseOp<ArrayXXi, 0>;
+    STATIC_CHECK(( internal::is_same<VecOp::const_iterator, decltype(std::declval<const VecOp&>().cbegin())>::value ));
+    STATIC_CHECK(( internal::is_same<VecOp::const_iterator, decltype(std::declval<const VecOp&>().cend  ())>::value ));
+    #if EIGEN_COMP_CXXVER>=14
+      STATIC_CHECK(( internal::is_same<VecOp::const_iterator, decltype(std::cbegin(std::declval<const VecOp&>()))>::value ));
+      STATIC_CHECK(( internal::is_same<VecOp::const_iterator, decltype(std::cend  (std::declval<const VecOp&>()))>::value ));
+    #endif
+  }
+
+#endif
+}
+
+
+#if EIGEN_HAS_CXX11
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them).  It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0.  If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked.
+template <class C,
+          class Iterator = decltype(::std::declval<const C&>().begin()),
+          class = decltype(::std::declval<const C&>().end()),
+          class = decltype(++::std::declval<Iterator&>()),
+          class = decltype(*::std::declval<Iterator>()),
+          class = typename C::const_iterator>
+bool IsContainerType(int /* dummy */) { return true; }
+
+template <class C>
+bool IsContainerType(long /* dummy */) { return false; }
+
+template <typename Scalar, int Rows, int Cols>
+void test_stl_container_detection(int rows=Rows, int cols=Cols)
+{
+  typedef Matrix<Scalar,Rows,1> VectorType;
+  typedef Matrix<Scalar,Rows,Cols,ColMajor> ColMatrixType;
+  typedef Matrix<Scalar,Rows,Cols,RowMajor> RowMatrixType;
+
+  ColMatrixType A = ColMatrixType::Random(rows, cols);
+  RowMatrixType B = RowMatrixType::Random(rows, cols);
+
+  Index i = 1;
+
+  using ColMatrixColType = decltype(A.col(i));
+  using ColMatrixRowType = decltype(A.row(i));
+  using RowMatrixColType = decltype(B.col(i));
+  using RowMatrixRowType = decltype(B.row(i));
+
+  // Vector and matrix col/row are valid Stl-style container.
+  VERIFY_IS_EQUAL(IsContainerType<VectorType>(0), true);
+  VERIFY_IS_EQUAL(IsContainerType<ColMatrixColType>(0), true);
+  VERIFY_IS_EQUAL(IsContainerType<ColMatrixRowType>(0), true);
+  VERIFY_IS_EQUAL(IsContainerType<RowMatrixColType>(0), true);
+  VERIFY_IS_EQUAL(IsContainerType<RowMatrixRowType>(0), true);
+
+  // But the matrix itself is not a valid Stl-style container.
+  VERIFY_IS_EQUAL(IsContainerType<ColMatrixType>(0), rows == 1 || cols == 1);
+  VERIFY_IS_EQUAL(IsContainerType<RowMatrixType>(0), rows == 1 || cols == 1);
+}
+#endif
+
+EIGEN_DECLARE_TEST(stl_iterators)
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(( test_stl_iterators<double,2,3>() ));
+    CALL_SUBTEST_1(( test_stl_iterators<float,7,5>() ));
+    CALL_SUBTEST_1(( test_stl_iterators<int,Dynamic,Dynamic>(internal::random<int>(5,10), internal::random<int>(5,10)) ));
+    CALL_SUBTEST_1(( test_stl_iterators<int,Dynamic,Dynamic>(internal::random<int>(10,200), internal::random<int>(10,200)) ));
+  }
+  
+#if EIGEN_HAS_CXX11
+  CALL_SUBTEST_1(( test_stl_container_detection<float,1,1>() ));
+  CALL_SUBTEST_1(( test_stl_container_detection<float,5,5>() ));
+#endif  
+}
diff --git a/contrib/eigen/test/superlu_support.cpp b/contrib/eigen/test/superlu_support.cpp
index 98a7bc5c82660f82969649fb3320415e82bf2c3c..55450c8687a4f8f8ad805a1124c7990e4db42c8c 100644
--- a/contrib/eigen/test/superlu_support.cpp
+++ b/contrib/eigen/test/superlu_support.cpp
@@ -12,7 +12,7 @@
 
 #include <Eigen/SuperLUSupport>
 
-void test_superlu_support()
+EIGEN_DECLARE_TEST(superlu_support)
 {
   SuperLU<SparseMatrix<double> > superlu_double_colmajor;
   SuperLU<SparseMatrix<std::complex<double> > > superlu_cplxdouble_colmajor;
diff --git a/contrib/eigen/test/svd_common.h b/contrib/eigen/test/svd_common.h
index cba0665936e3cf131194fab9480c8fb5269d7bab..eae4c0bfe31e98b7e3cbe157768300f813220cee 100644
--- a/contrib/eigen/test/svd_common.h
+++ b/contrib/eigen/test/svd_common.h
@@ -17,6 +17,7 @@
 #endif
 
 #include "svd_fill.h"
+#include "solverbase.h"
 
 // Check that the matrix m is properly reconstructed and that the U and V factors are unitary
 // The SVD must have already been computed.
@@ -219,12 +220,33 @@ void svd_min_norm(const MatrixType& m, unsigned int computationOptions)
   VERIFY_IS_APPROX(x21, x3);
 }
 
+template<typename MatrixType, typename SolverType>
+void svd_test_solvers(const MatrixType& m, const SolverType& solver) {
+    Index rows, cols, cols2;
+
+    rows = m.rows();
+    cols = m.cols();
+
+    if(MatrixType::ColsAtCompileTime==Dynamic)
+    {
+      cols2 = internal::random<int>(2,EIGEN_TEST_MAX_SIZE);
+    }
+    else
+    {
+      cols2 = cols;
+    }
+    typedef Matrix<typename MatrixType::Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> CMatrixType;
+    check_solverbase<CMatrixType, MatrixType>(m, solver, rows, cols, cols2);
+}
+
 // Check full, compare_to_full, least_square, and min_norm for all possible compute-options
 template<typename SvdType, typename MatrixType>
 void svd_test_all_computation_options(const MatrixType& m, bool full_only)
 {
 //   if (QRPreconditioner == NoQRPreconditioner && m.rows() != m.cols())
 //     return;
+  STATIC_CHECK(( internal::is_same<typename SvdType::StorageIndex,int>::value ));
+
   SvdType fullSvd(m, ComputeFullU|ComputeFullV);
   CALL_SUBTEST(( svd_check_full(m, fullSvd) ));
   CALL_SUBTEST(( svd_least_square<SvdType>(m, ComputeFullU | ComputeFullV) ));
@@ -234,6 +256,9 @@ void svd_test_all_computation_options(const MatrixType& m, bool full_only)
   // remark #111: statement is unreachable
   #pragma warning disable 111
   #endif
+
+  svd_test_solvers(m, fullSvd);
+
   if(full_only)
     return;
 
@@ -273,7 +298,8 @@ EIGEN_DONT_INLINE Scalar zero() { return Scalar(0); }
 // workaround aggressive optimization in ICC
 template<typename T> EIGEN_DONT_INLINE  T sub(T a, T b) { return a - b; }
 
-// all this function does is verify we don't iterate infinitely on nan/inf values
+// This function verifies we don't iterate infinitely on nan/inf values,
+// and that info() returns InvalidInput.
 template<typename SvdType, typename MatrixType>
 void svd_inf_nan()
 {
@@ -282,18 +308,22 @@ void svd_inf_nan()
   Scalar some_inf = Scalar(1) / zero<Scalar>();
   VERIFY(sub(some_inf, some_inf) != sub(some_inf, some_inf));
   svd.compute(MatrixType::Constant(10,10,some_inf), ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
 
   Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
   VERIFY(nan != nan);
   svd.compute(MatrixType::Constant(10,10,nan), ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);  
 
   MatrixType m = MatrixType::Zero(10,10);
   m(internal::random<int>(0,9), internal::random<int>(0,9)) = some_inf;
   svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
 
   m = MatrixType::Zero(10,10);
   m(internal::random<int>(0,9), internal::random<int>(0,9)) = nan;
   svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
   
   // regression test for bug 791
   m.resize(3,3);
@@ -301,6 +331,7 @@ void svd_inf_nan()
        0,   -0.5,                             0,
        nan,  0,                               0;
   svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
   
   m.resize(4,4);
   m <<  1, 0, 0, 0,
@@ -308,6 +339,7 @@ void svd_inf_nan()
         1, 0, 1, nan,
         0, nan, nan, 0;
   svd.compute(m, ComputeFullU | ComputeFullV);
+  VERIFY(svd.info() == InvalidInput);
 }
 
 // Regression test for bug 286: JacobiSVD loops indefinitely with some
@@ -430,7 +462,7 @@ void svd_preallocate()
 }
 
 template<typename SvdType,typename MatrixType> 
-void svd_verify_assert(const MatrixType& m)
+void svd_verify_assert(const MatrixType& m, bool fullOnly = false)
 {
   typedef typename MatrixType::Scalar Scalar;
   Index rows = m.rows();
@@ -448,6 +480,8 @@ void svd_verify_assert(const MatrixType& m)
   VERIFY_RAISES_ASSERT(svd.singularValues())
   VERIFY_RAISES_ASSERT(svd.matrixV())
   VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  VERIFY_RAISES_ASSERT(svd.transpose().solve(rhs))
+  VERIFY_RAISES_ASSERT(svd.adjoint().solve(rhs))
   MatrixType a = MatrixType::Zero(rows, cols);
   a.setZero();
   svd.compute(a, 0);
@@ -455,8 +489,17 @@ void svd_verify_assert(const MatrixType& m)
   VERIFY_RAISES_ASSERT(svd.matrixV())
   svd.singularValues();
   VERIFY_RAISES_ASSERT(svd.solve(rhs))
-    
-  if (ColsAtCompileTime == Dynamic)
+
+  svd.compute(a, ComputeFullU);
+  svd.matrixU();
+  VERIFY_RAISES_ASSERT(svd.matrixV())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  svd.compute(a, ComputeFullV);
+  svd.matrixV();
+  VERIFY_RAISES_ASSERT(svd.matrixU())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+
+  if (!fullOnly && ColsAtCompileTime == Dynamic)
   {
     svd.compute(a, ComputeThinU);
     svd.matrixU();
diff --git a/contrib/eigen/test/swap.cpp b/contrib/eigen/test/swap.cpp
index f76e3624ddfd189c410f31a8527188a2462b5c4e..5b259d3ecac476c7f08fca5b134f0f2bc61fd160 100644
--- a/contrib/eigen/test/swap.cpp
+++ b/contrib/eigen/test/swap.cpp
@@ -28,8 +28,8 @@ template<typename MatrixType> void swap(const MatrixType& m)
   typedef typename MatrixType::Scalar Scalar;
 
   eigen_assert((!internal::is_same<MatrixType,OtherMatrixType>::value));
-  typename MatrixType::Index rows = m.rows();
-  typename MatrixType::Index cols = m.cols();
+  Index rows = m.rows();
+  Index cols = m.cols();
   
   // construct 3 matrix guaranteed to be distinct
   MatrixType m1 = MatrixType::Random(rows,cols);
@@ -83,7 +83,7 @@ template<typename MatrixType> void swap(const MatrixType& m)
   }
 }
 
-void test_swap()
+EIGEN_DECLARE_TEST(swap)
 {
   int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
   CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization 
diff --git a/contrib/eigen/test/symbolic_index.cpp b/contrib/eigen/test/symbolic_index.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b114cbb9565eb8ff4a403e918f37211123d7214a
--- /dev/null
+++ b/contrib/eigen/test/symbolic_index.cpp
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef EIGEN_TEST_PART_2
+#define EIGEN_MAX_CPP_VER 03
+
+// see indexed_view.cpp
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+  #pragma GCC diagnostic ignored "-Wdeprecated"
+#endif
+
+#endif
+
+#include "main.h"
+
+template<typename T1,typename T2>
+bool is_same_symb(const T1& a, const T2& b, Index size)
+{
+  return a.eval(last=size-1) == b.eval(last=size-1);
+}
+
+template<typename T>
+void check_is_symbolic(const T&) {
+  STATIC_CHECK(( symbolic::is_symbolic<T>::value ))
+}
+
+template<typename T>
+void check_isnot_symbolic(const T&) {
+  STATIC_CHECK(( !symbolic::is_symbolic<T>::value ))
+}
+
+#define VERIFY_EQ_INT(A,B) VERIFY_IS_APPROX(int(A),int(B))
+
+void check_symbolic_index()
+{
+  check_is_symbolic(last);
+  check_is_symbolic(lastp1);
+  check_is_symbolic(last+1);
+  check_is_symbolic(last-lastp1);
+  check_is_symbolic(2*last-lastp1/2);
+  check_isnot_symbolic(fix<3>());
+
+  Index size=100;
+
+  // First, let's check FixedInt arithmetic:
+  VERIFY( is_same_type( (fix<5>()-fix<3>())*fix<9>()/(-fix<3>()), fix<-(5-3)*9/3>() ) );
+  VERIFY( is_same_type( (fix<5>()-fix<3>())*fix<9>()/fix<2>(), fix<(5-3)*9/2>() ) );
+  VERIFY( is_same_type( fix<9>()/fix<2>(), fix<9/2>() ) );
+  VERIFY( is_same_type( fix<9>()%fix<2>(), fix<9%2>() ) );
+  VERIFY( is_same_type( fix<9>()&fix<2>(), fix<9&2>() ) );
+  VERIFY( is_same_type( fix<9>()|fix<2>(), fix<9|2>() ) );
+  VERIFY( is_same_type( fix<9>()/2, int(9/2) ) );
+
+  VERIFY( is_same_symb( lastp1-1, last, size) );
+  VERIFY( is_same_symb( lastp1-fix<1>, last, size) );
+
+  VERIFY_IS_EQUAL( ( (last*5-2)/3 ).eval(last=size-1), ((size-1)*5-2)/3 );
+  VERIFY_IS_EQUAL( ( (last*fix<5>-fix<2>)/fix<3> ).eval(last=size-1), ((size-1)*5-2)/3 );
+  VERIFY_IS_EQUAL( ( -last*lastp1  ).eval(last=size-1), -(size-1)*size );
+  VERIFY_IS_EQUAL( ( lastp1-3*last  ).eval(last=size-1), size- 3*(size-1) );
+  VERIFY_IS_EQUAL( ( (lastp1-3*last)/lastp1  ).eval(last=size-1), (size- 3*(size-1))/size );
+
+#if EIGEN_HAS_CXX14
+  {
+    struct x_tag {};  static const symbolic::SymbolExpr<x_tag> x;
+    struct y_tag {};  static const symbolic::SymbolExpr<y_tag> y;
+    struct z_tag {};  static const symbolic::SymbolExpr<z_tag> z;
+
+    VERIFY_IS_APPROX( int(((x+3)/y+z).eval(x=6,y=3,z=-13)), (6+3)/3+(-13) );
+  }
+#endif
+}
+
+EIGEN_DECLARE_TEST(symbolic_index)
+{
+  CALL_SUBTEST_1( check_symbolic_index() );
+  CALL_SUBTEST_2( check_symbolic_index() );
+}
diff --git a/contrib/eigen/test/triangular.cpp b/contrib/eigen/test/triangular.cpp
index 328eef4de20bf33261feb4f0e429a5445a545aac..981a0d0714c48c20cbcd369e0c4dd9333216a64a 100644
--- a/contrib/eigen/test/triangular.cpp
+++ b/contrib/eigen/test/triangular.cpp
@@ -7,9 +7,35 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifdef EIGEN_TEST_PART_100
+#  define EIGEN_NO_DEPRECATED_WARNING
+#endif
+
 #include "main.h"
 
 
+template<typename MatrixType> void triangular_deprecated(const MatrixType &m)
+{
+  Index rows = m.rows();
+  Index cols = m.cols();
+  MatrixType m1, m2, m3, m4;
+  m1.setRandom(rows,cols);
+  m2.setRandom(rows,cols);
+  m3 = m1; m4 = m2;
+  // deprecated method:
+  m1.template triangularView<Eigen::Upper>().swap(m2);
+  // use this method instead:
+  m3.template triangularView<Eigen::Upper>().swap(m4.template triangularView<Eigen::Upper>());
+  VERIFY_IS_APPROX(m1,m3);
+  VERIFY_IS_APPROX(m2,m4);
+  // deprecated method:
+  m1.template triangularView<Eigen::Lower>().swap(m4);
+  // use this method instead:
+  m3.template triangularView<Eigen::Lower>().swap(m2.template triangularView<Eigen::Lower>());
+  VERIFY_IS_APPROX(m1,m3);
+  VERIFY_IS_APPROX(m2,m4);
+}
+
 
 template<typename MatrixType> void triangular_square(const MatrixType& m)
 {
@@ -19,8 +45,8 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
 
   RealScalar largerEps = 10*test_precision<RealScalar>();
 
-  typename MatrixType::Index rows = m.rows();
-  typename MatrixType::Index cols = m.cols();
+  Index rows = m.rows();
+  Index cols = m.cols();
 
   MatrixType m1 = MatrixType::Random(rows, cols),
              m2 = MatrixType::Random(rows, cols),
@@ -68,7 +94,7 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
     while (numext::abs2(m1(i,i))<RealScalar(1e-1)) m1(i,i) = internal::random<Scalar>();
 
   Transpose<MatrixType> trm4(m4);
-  // test back and forward subsitution with a vector as the rhs
+  // test back and forward substitution with a vector as the rhs
   m3 = m1.template triangularView<Upper>();
   VERIFY(v2.isApprox(m3.adjoint() * (m1.adjoint().template triangularView<Lower>().solve(v2)), largerEps));
   m3 = m1.template triangularView<Lower>();
@@ -109,11 +135,12 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
   // test swap
   m1.setOnes();
   m2.setZero();
-  m2.template triangularView<Upper>().swap(m1);
+  m2.template triangularView<Upper>().swap(m1.template triangularView<Eigen::Upper>());
   m3.setZero();
   m3.template triangularView<Upper>().setOnes();
   VERIFY_IS_APPROX(m2,m3);
-  
+  VERIFY_RAISES_STATIC_ASSERT(m1.template triangularView<Eigen::Lower>().swap(m2.template triangularView<Eigen::Upper>()));
+
   m1.setRandom();
   m3 = m1.template triangularView<Upper>();
   Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic> m5(cols, internal::random<int>(1,20));  m5.setRandom();
@@ -129,6 +156,22 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
 
   VERIFY_IS_APPROX(m1.template selfadjointView<Upper>().diagonal(), m1.diagonal());
 
+  m3.setRandom();
+  const MatrixType& m3c(m3);
+  VERIFY( is_same_type(m3c.template triangularView<Lower>(),m3.template triangularView<Lower>().template conjugateIf<false>()) );
+  VERIFY( is_same_type(m3c.template triangularView<Lower>().conjugate(),m3.template triangularView<Lower>().template conjugateIf<true>()) );
+  VERIFY_IS_APPROX(m3.template triangularView<Lower>().template conjugateIf<true>().toDenseMatrix(),
+                   m3.conjugate().template triangularView<Lower>().toDenseMatrix());
+  VERIFY_IS_APPROX(m3.template triangularView<Lower>().template conjugateIf<false>().toDenseMatrix(),
+                   m3.template triangularView<Lower>().toDenseMatrix());
+
+  VERIFY( is_same_type(m3c.template selfadjointView<Lower>(),m3.template selfadjointView<Lower>().template conjugateIf<false>()) );
+  VERIFY( is_same_type(m3c.template selfadjointView<Lower>().conjugate(),m3.template selfadjointView<Lower>().template conjugateIf<true>()) );
+  VERIFY_IS_APPROX(m3.template selfadjointView<Lower>().template conjugateIf<true>().toDenseMatrix(),
+                   m3.conjugate().template selfadjointView<Lower>().toDenseMatrix());
+  VERIFY_IS_APPROX(m3.template selfadjointView<Lower>().template conjugateIf<false>().toDenseMatrix(),
+                   m3.template selfadjointView<Lower>().toDenseMatrix());
+
 }
 
 
@@ -208,7 +251,7 @@ template<typename MatrixType> void triangular_rect(const MatrixType& m)
   // test swap
   m1.setOnes();
   m2.setZero();
-  m2.template triangularView<Upper>().swap(m1);
+  m2.template triangularView<Upper>().swap(m1.template triangularView<Eigen::Upper>());
   m3.setZero();
   m3.template triangularView<Upper>().setOnes();
   VERIFY_IS_APPROX(m2,m3);
@@ -220,7 +263,7 @@ void bug_159()
   EIGEN_UNUSED_VARIABLE(m)
 }
 
-void test_triangular()
+EIGEN_DECLARE_TEST(triangular)
 {
   int maxsize = (std::min)(EIGEN_TEST_MAX_SIZE,20);
   for(int i = 0; i < g_repeat ; i++)
@@ -240,6 +283,9 @@ void test_triangular()
     CALL_SUBTEST_9( triangular_rect(MatrixXcf(r, c)) );
     CALL_SUBTEST_5( triangular_rect(MatrixXcd(r, c)) );
     CALL_SUBTEST_6( triangular_rect(Matrix<float,Dynamic,Dynamic,RowMajor>(r, c)) );
+
+    CALL_SUBTEST_100( triangular_deprecated(Matrix<float, 5, 7>()) );
+    CALL_SUBTEST_100( triangular_deprecated(MatrixXd(r,c)) );
   }
   
   CALL_SUBTEST_1( bug_159() );
diff --git a/contrib/eigen/test/type_alias.cpp b/contrib/eigen/test/type_alias.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9a6616c72c6c2b4e49b9a92a0684587c9a41c993
--- /dev/null
+++ b/contrib/eigen/test/type_alias.cpp
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+EIGEN_DECLARE_TEST(type_alias)
+{
+  using namespace internal;
+
+  // To warm up, some basic checks:
+  STATIC_CHECK((is_same<MatrixXd,Matrix<double,Dynamic,Dynamic> >::value));
+  STATIC_CHECK((is_same<Matrix2f,Matrix<float,2,2> >::value));
+  STATIC_CHECK((is_same<Array33i,Array<int,3,3> >::value));
+
+#if EIGEN_HAS_CXX11
+  
+  STATIC_CHECK((is_same<MatrixX<double>,    MatrixXd>::value));
+  STATIC_CHECK((is_same<MatrixX<int>,       MatrixXi>::value));
+  STATIC_CHECK((is_same<Matrix2<int>,       Matrix2i>::value));
+  STATIC_CHECK((is_same<Matrix2X<float>,    Matrix2Xf>::value));
+  STATIC_CHECK((is_same<MatrixX4<double>,   MatrixX4d>::value));
+  STATIC_CHECK((is_same<VectorX<int>,       VectorXi>::value));
+  STATIC_CHECK((is_same<Vector2<float>,     Vector2f>::value));
+  STATIC_CHECK((is_same<RowVectorX<int>,    RowVectorXi>::value));
+  STATIC_CHECK((is_same<RowVector2<float>,  RowVector2f>::value));
+
+  STATIC_CHECK((is_same<ArrayXX<float>,     ArrayXXf>::value));
+  STATIC_CHECK((is_same<Array33<int>,       Array33i>::value));
+  STATIC_CHECK((is_same<Array2X<float>,     Array2Xf>::value));
+  STATIC_CHECK((is_same<ArrayX4<double>,    ArrayX4d>::value));
+  STATIC_CHECK((is_same<ArrayX<double>,     ArrayXd>::value));
+  STATIC_CHECK((is_same<Array4<double>,     Array4d>::value));
+
+  STATIC_CHECK((is_same<Vector<float,3>,        Vector3f>::value));
+  STATIC_CHECK((is_same<Vector<int,Dynamic>,    VectorXi>::value));
+  STATIC_CHECK((is_same<RowVector<float,3>,     RowVector3f>::value));
+  STATIC_CHECK((is_same<RowVector<int,Dynamic>, RowVectorXi>::value));
+
+#else
+  std::cerr << "WARNING: c++11 type aliases not tested.\n";
+#endif
+}
diff --git a/contrib/eigen/test/umeyama.cpp b/contrib/eigen/test/umeyama.cpp
index 2e8092434820b7b23d1fddbbacf037524230cfd6..170c28a6116b2e964e202e6518178f556377786a 100644
--- a/contrib/eigen/test/umeyama.cpp
+++ b/contrib/eigen/test/umeyama.cpp
@@ -27,7 +27,7 @@ Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> randMatrixUnitary(int size)
   MatrixType Q;
 
   int max_tries = 40;
-  double is_unitary = false;
+  bool is_unitary = false;
 
   while (!is_unitary && max_tries > 0)
   {
@@ -155,7 +155,7 @@ void run_fixed_size_test(int num_elements)
   VERIFY(error < Scalar(16)*std::numeric_limits<Scalar>::epsilon());
 }
 
-void test_umeyama()
+EIGEN_DECLARE_TEST(umeyama)
 {
   for (int i=0; i<g_repeat; ++i)
   {
diff --git a/contrib/eigen/test/umfpack_support.cpp b/contrib/eigen/test/umfpack_support.cpp
index 37ab11f0b94a48f6da7032ca34f7b55ab47a86de..d8f2a6f806b4591c6226b00481d25545a811e395 100644
--- a/contrib/eigen/test/umfpack_support.cpp
+++ b/contrib/eigen/test/umfpack_support.cpp
@@ -12,10 +12,10 @@
 
 #include <Eigen/UmfPackSupport>
 
-template<typename T> void test_umfpack_support_T()
+template<typename T1, typename T2> void test_umfpack_support_T()
 {
-  UmfPackLU<SparseMatrix<T, ColMajor> > umfpack_colmajor;
-  UmfPackLU<SparseMatrix<T, RowMajor> > umfpack_rowmajor;
+  UmfPackLU<SparseMatrix<T1, ColMajor, T2> > umfpack_colmajor;
+  UmfPackLU<SparseMatrix<T1, RowMajor, T2> > umfpack_rowmajor;
   
   check_sparse_square_solving(umfpack_colmajor);
   check_sparse_square_solving(umfpack_rowmajor);
@@ -24,9 +24,11 @@ template<typename T> void test_umfpack_support_T()
   check_sparse_square_determinant(umfpack_rowmajor);
 }
 
-void test_umfpack_support()
+EIGEN_DECLARE_TEST(umfpack_support)
 {
-  CALL_SUBTEST_1(test_umfpack_support_T<double>());
-  CALL_SUBTEST_2(test_umfpack_support_T<std::complex<double> >());
+  CALL_SUBTEST_1((test_umfpack_support_T<double, int>()));
+  CALL_SUBTEST_2((test_umfpack_support_T<std::complex<double>, int>()));
+  CALL_SUBTEST_3((test_umfpack_support_T<double, long >()));
+  CALL_SUBTEST_4((test_umfpack_support_T<std::complex<double>, long>()));
 }
 
diff --git a/contrib/eigen/test/unalignedassert.cpp b/contrib/eigen/test/unalignedassert.cpp
deleted file mode 100644
index 731a08977c6bbd97d5bb0d1a918e72eb85164bfe..0000000000000000000000000000000000000000
--- a/contrib/eigen/test/unalignedassert.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_TEST_PART_1)
-  // default
-#elif defined(EIGEN_TEST_PART_2)
-  #define EIGEN_MAX_STATIC_ALIGN_BYTES 16
-  #define EIGEN_MAX_ALIGN_BYTES 16
-#elif defined(EIGEN_TEST_PART_3)
-  #define EIGEN_MAX_STATIC_ALIGN_BYTES 32
-  #define EIGEN_MAX_ALIGN_BYTES 32
-#elif defined(EIGEN_TEST_PART_4)
-  #define EIGEN_MAX_STATIC_ALIGN_BYTES 64
-  #define EIGEN_MAX_ALIGN_BYTES 64
-#endif
-
-#include "main.h"
-
-typedef Matrix<float,  6,1> Vector6f;
-typedef Matrix<float,  8,1> Vector8f;
-typedef Matrix<float, 12,1> Vector12f;
-
-typedef Matrix<double, 5,1> Vector5d;
-typedef Matrix<double, 6,1> Vector6d;
-typedef Matrix<double, 7,1> Vector7d;
-typedef Matrix<double, 8,1> Vector8d;
-typedef Matrix<double, 9,1> Vector9d;
-typedef Matrix<double,10,1> Vector10d;
-typedef Matrix<double,12,1> Vector12d;
-
-struct TestNew1
-{
-  MatrixXd m; // good: m will allocate its own array, taking care of alignment.
-  TestNew1() : m(20,20) {}
-};
-
-struct TestNew2
-{
-  Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned,
-              // 8-byte alignment is good enough here, which we'll get automatically
-};
-
-struct TestNew3
-{
-  Vector2f m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned
-};
-
-struct TestNew4
-{
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  Vector2d m;
-  float f; // make the struct have sizeof%16!=0 to make it a little more tricky when we allow an array of 2 such objects
-};
-
-struct TestNew5
-{
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  float f; // try the f at first -- the EIGEN_ALIGN_MAX attribute of m should make that still work
-  Matrix4f m;
-};
-
-struct TestNew6
-{
-  Matrix<float,2,2,DontAlign> m; // good: no alignment requested
-  float f;
-};
-
-template<bool Align> struct Depends
-{
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(Align)
-  Vector2d m;
-  float f;
-};
-
-template<typename T>
-void check_unalignedassert_good()
-{
-  T *x, *y;
-  x = new T;
-  delete x;
-  y = new T[2];
-  delete[] y;
-}
-
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
-template<typename T>
-void construct_at_boundary(int boundary)
-{
-  char buf[sizeof(T)+256];
-  size_t _buf = reinterpret_cast<internal::UIntPtr>(buf);
-  _buf += (EIGEN_MAX_ALIGN_BYTES - (_buf % EIGEN_MAX_ALIGN_BYTES)); // make 16/32/...-byte aligned
-  _buf += boundary; // make exact boundary-aligned
-  T *x = ::new(reinterpret_cast<void*>(_buf)) T;
-  x[0].setZero(); // just in order to silence warnings
-  x->~T();
-}
-#endif
-
-void unalignedassert()
-{
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  construct_at_boundary<Vector2f>(4);
-  construct_at_boundary<Vector3f>(4);
-  construct_at_boundary<Vector4f>(16);
-  construct_at_boundary<Vector6f>(4);
-  construct_at_boundary<Vector8f>(EIGEN_MAX_ALIGN_BYTES);
-  construct_at_boundary<Vector12f>(16);
-  construct_at_boundary<Matrix2f>(16);
-  construct_at_boundary<Matrix3f>(4);
-  construct_at_boundary<Matrix4f>(EIGEN_MAX_ALIGN_BYTES);
-
-  construct_at_boundary<Vector2d>(16);
-  construct_at_boundary<Vector3d>(4);
-  construct_at_boundary<Vector4d>(EIGEN_MAX_ALIGN_BYTES);
-  construct_at_boundary<Vector5d>(4);
-  construct_at_boundary<Vector6d>(16);
-  construct_at_boundary<Vector7d>(4);
-  construct_at_boundary<Vector8d>(EIGEN_MAX_ALIGN_BYTES);
-  construct_at_boundary<Vector9d>(4);
-  construct_at_boundary<Vector10d>(16);
-  construct_at_boundary<Vector12d>(EIGEN_MAX_ALIGN_BYTES);
-  construct_at_boundary<Matrix2d>(EIGEN_MAX_ALIGN_BYTES);
-  construct_at_boundary<Matrix3d>(4);
-  construct_at_boundary<Matrix4d>(EIGEN_MAX_ALIGN_BYTES);
-
-  construct_at_boundary<Vector2cf>(16);
-  construct_at_boundary<Vector3cf>(4);
-  construct_at_boundary<Vector2cd>(EIGEN_MAX_ALIGN_BYTES);
-  construct_at_boundary<Vector3cd>(16);
-#endif
-
-  check_unalignedassert_good<TestNew1>();
-  check_unalignedassert_good<TestNew2>();
-  check_unalignedassert_good<TestNew3>();
-
-  check_unalignedassert_good<TestNew4>();
-  check_unalignedassert_good<TestNew5>();
-  check_unalignedassert_good<TestNew6>();
-  check_unalignedassert_good<Depends<true> >();
-
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(EIGEN_MAX_ALIGN_BYTES>=16)
-  {
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12f>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector6d>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8d>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector10d>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector12d>(8));
-    // Complexes are disabled because the compiler might aggressively vectorize
-    // the initialization of complex coeffs to 0 before we can check for alignedness
-    //VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cf>(8));
-    VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4i>(8));
-  }
-  for(int b=8; b<EIGEN_MAX_ALIGN_BYTES; b+=8)
-  {
-    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector8f>(b));
-    if(b<64)  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(b));
-    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(b));
-    if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2d>(b));
-    if(b<128) VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4d>(b));
-    //if(b<32)  VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2cd>(b));
-  }
-#endif
-}
-
-void test_unalignedassert()
-{
-  CALL_SUBTEST(unalignedassert());
-}
diff --git a/contrib/eigen/test/unalignedcount.cpp b/contrib/eigen/test/unalignedcount.cpp
index d6ffeafdf9c88c2b3759aa6ff4f479f2ffbf9da9..52cdd9e1d163378f4e2d5f870f9ebc2700758be8 100644
--- a/contrib/eigen/test/unalignedcount.cpp
+++ b/contrib/eigen/test/unalignedcount.cpp
@@ -28,9 +28,16 @@ static int nb_storeu;
 
 #include "main.h"
 
-void test_unalignedcount()
+EIGEN_DECLARE_TEST(unalignedcount)
 {
-  #if defined(EIGEN_VECTORIZE_AVX)
+  #if defined(EIGEN_VECTORIZE_AVX512)
+  VectorXf a(48), b(48);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 6, 0, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) += b.segment(0,48), 3, 3, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) -= b.segment(0,48), 3, 3, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) *= 3.5, 3, 0, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) /= 3.5, 3, 0, 3, 0);
+  #elif defined(EIGEN_VECTORIZE_AVX)
   VectorXf a(40), b(40);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0);
diff --git a/contrib/eigen/test/upperbidiagonalization.cpp b/contrib/eigen/test/upperbidiagonalization.cpp
index 847b34b550951d049224202c772dc5c7cecdb4bb..945c999598f9f415d82642c9702ef78871702a44 100644
--- a/contrib/eigen/test/upperbidiagonalization.cpp
+++ b/contrib/eigen/test/upperbidiagonalization.cpp
@@ -12,8 +12,8 @@
 
 template<typename MatrixType> void upperbidiag(const MatrixType& m)
 {
-  const typename MatrixType::Index rows = m.rows();
-  const typename MatrixType::Index cols = m.cols();
+  const Index rows = m.rows();
+  const Index cols = m.cols();
 
   typedef Matrix<typename MatrixType::RealScalar, MatrixType::RowsAtCompileTime,  MatrixType::ColsAtCompileTime> RealMatrixType;
   typedef Matrix<typename MatrixType::Scalar, MatrixType::ColsAtCompileTime,  MatrixType::RowsAtCompileTime> TransposeMatrixType;
@@ -29,7 +29,7 @@ template<typename MatrixType> void upperbidiag(const MatrixType& m)
   VERIFY_IS_APPROX(a.adjoint(),d);
 }
 
-void test_upperbidiagonalization()
+EIGEN_DECLARE_TEST(upperbidiagonalization)
 {
   for(int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST_1( upperbidiag(MatrixXf(3,3)) );
diff --git a/contrib/eigen/test/vectorization_logic.cpp b/contrib/eigen/test/vectorization_logic.cpp
index c2f77bfec21328e10b5043640870227f60b08370..97c0bdad9859b083ae99e7595ec5faade2540e2c 100644
--- a/contrib/eigen/test/vectorization_logic.cpp
+++ b/contrib/eigen/test/vectorization_logic.cpp
@@ -37,6 +37,7 @@ using internal::demangle_unrolling;
 template<typename Dst, typename Src>
 bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
 {
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src);
   typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
   bool res = traits::Traversal==traversal;
   if(unrolling==InnerUnrolling+CompleteUnrolling)
@@ -61,6 +62,7 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
 template<typename Dst, typename Src>
 bool test_assign(int traversal, int unrolling)
 {
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src);
   typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
   bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
   if(!res)
@@ -117,26 +119,26 @@ struct vectorization_logic
     typedef Matrix<Scalar,Dynamic,1> VectorX;
     typedef Matrix<Scalar,Dynamic,Dynamic> MatrixXX;
     typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11;
-    typedef Matrix<Scalar,2*PacketSize,2*PacketSize> Matrix22;
+    typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?8:2*PacketSize,(Matrix11::Flags&RowMajorBit)?2*PacketSize:8>   Matrix22;
     typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44;
     typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u;
     typedef Matrix<Scalar,4*PacketSize,4*PacketSize,ColMajor> Matrix44c;
     typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r;
 
     typedef Matrix<Scalar,
-        (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
-        (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
+        (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
       > Matrix1;
 
     typedef Matrix<Scalar,
-        (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
-        (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
       DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u;
 
     // this type is made such that it can only be vectorized when viewed as a linear 1D vector
     typedef Matrix<Scalar,
-        (PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
-        (PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
+        (PacketSize==16 ?  4 : PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 12 : PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
       > Matrix3;
     
     #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
@@ -149,14 +151,6 @@ struct vectorization_logic
     VERIFY(test_assign(Vector1(),Vector1().template cast<Scalar>(),
       InnerVectorizedTraversal,CompleteUnrolling));
 
-
-    VERIFY(test_assign(Vector1(),Vector1(),
-      InnerVectorizedTraversal,CompleteUnrolling));
-    VERIFY(test_assign(Vector1(),Vector1()+Vector1(),
-      InnerVectorizedTraversal,CompleteUnrolling));
-    VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()),
-      InnerVectorizedTraversal,CompleteUnrolling));
-
     VERIFY(test_assign(Matrix44(),Matrix44()+Matrix44(),
       InnerVectorizedTraversal,InnerUnrolling));
 
@@ -165,11 +159,11 @@ struct vectorization_logic
       EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling));
 
     VERIFY(test_assign(Matrix1(),Matrix1()+Matrix1(),
-      (Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal,
+      (int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal,
       CompleteUnrolling));
 
     VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(),
-      EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
+      EIGEN_UNALIGNED_VECTORIZE ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
                                 : LinearTraversal, CompleteUnrolling));
 
     VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3),
@@ -185,24 +179,25 @@ struct vectorization_logic
       VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1),
         LinearTraversal,CompleteUnrolling));
       VERIFY(test_assign(Vector3(),Vector3()+Vector3(),
-        EIGEN_UNALIGNED_VECTORIZE ? (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal), CompleteUnrolling));
+        sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal), CompleteUnrolling));
       VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
-        EIGEN_UNALIGNED_VECTORIZE ? (HalfPacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : (HalfPacketSize==1 ? SliceVectorizedTraversal : LinearTraversal),
-        ((!EIGEN_UNALIGNED_VECTORIZE) && HalfPacketSize==1) ? NoUnrolling : CompleteUnrolling));
+        EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
+                                  : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal),
+        ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling));
 
       VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()),
         LinearVectorizedTraversal,CompleteUnrolling));
 
       VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
-        HalfPacketSize==1             ? InnerVectorizedTraversal  :
+        sizeof(Scalar)==16        ? InnerVectorizedTraversal  :
         EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal :
-                                        LinearTraversal,
+                                    LinearTraversal,
         NoUnrolling));
 
       VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling));
 
 
-      VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
+      VERIFY(test_assign(Matrix11(),Matrix<Scalar,21,21>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,21,21>().template block<PacketSize,PacketSize>(3,2),
         (EIGEN_UNALIGNED_VECTORIZE) ? InnerVectorizedTraversal : DefaultTraversal, CompleteUnrolling|InnerUnrolling));
 
       VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
@@ -230,8 +225,13 @@ struct vectorization_logic
     VERIFY(test_redux(Matrix44(),
       LinearVectorizedTraversal,NoUnrolling));
 
-    VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?4:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:4>(1,2),
-      DefaultTraversal,CompleteUnrolling));
+    if(PacketSize>1) {
+      VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?4:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:4>(1,2),
+        SliceVectorizedTraversal,CompleteUnrolling));
+
+      VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?2:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:2>(1,2),
+        DefaultTraversal,CompleteUnrolling));
+    }
 
     VERIFY(test_redux(Matrix44c().template block<2*PacketSize,1>(1,2),
       LinearVectorizedTraversal,CompleteUnrolling));
@@ -283,25 +283,21 @@ struct vectorization_logic_half
     typedef Matrix<Scalar,5*PacketSize,7,ColMajor> Matrix57;
     typedef Matrix<Scalar,3*PacketSize,5,ColMajor> Matrix35;
     typedef Matrix<Scalar,5*PacketSize,7,DontAlign|ColMajor> Matrix57u;
-//     typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44;
-//     typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u;
-//     typedef Matrix<Scalar,4*PacketSize,4*PacketSize,ColMajor> Matrix44c;
-//     typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r;
 
     typedef Matrix<Scalar,
-        (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
-        (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
+        (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
       > Matrix1;
 
     typedef Matrix<Scalar,
-        (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
-        (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
       DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u;
 
     // this type is made such that it can only be vectorized when viewed as a linear 1D vector
     typedef Matrix<Scalar,
-        (PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
-        (PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
+        (PacketSize==16 ?  4 : PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
+        (PacketSize==16 ? 12 : PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
       > Matrix3;
     
     #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
@@ -320,14 +316,6 @@ struct vectorization_logic_half
     VERIFY(test_assign(Vector1(),Vector1().template cast<Scalar>(),
       InnerVectorizedTraversal,CompleteUnrolling));
 
-
-    VERIFY(test_assign(Vector1(),Vector1(),
-      InnerVectorizedTraversal,CompleteUnrolling));
-    VERIFY(test_assign(Vector1(),Vector1()+Vector1(),
-      InnerVectorizedTraversal,CompleteUnrolling));
-    VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()),
-      InnerVectorizedTraversal,CompleteUnrolling));
-
     VERIFY(test_assign(Matrix57(),Matrix57()+Matrix57(),
       InnerVectorizedTraversal,InnerUnrolling));
 
@@ -336,7 +324,7 @@ struct vectorization_logic_half
       EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling));
 
     VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(),
-      EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling));
+      EIGEN_UNALIGNED_VECTORIZE ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling));
         
     if(PacketSize>1)
     {
@@ -344,17 +332,20 @@ struct vectorization_logic_half
       VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1),
         LinearTraversal,CompleteUnrolling));
       VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
-        EIGEN_UNALIGNED_VECTORIZE ? (PacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling));
+        EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal)
+                                  : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal),
+        ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling));
               
       VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()),
         PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
         
       VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
-        EIGEN_UNALIGNED_VECTORIZE ? (PacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,
+        sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal),
         NoUnrolling));
         
       VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
-        EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
+        EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : DefaultTraversal,InnerUnrolling+CompleteUnrolling));
+  
 
       VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
                          InnerVectorizedTraversal,CompleteUnrolling));
@@ -375,16 +366,21 @@ struct vectorization_logic_half
     VERIFY(test_redux(Matrix35(),
       LinearVectorizedTraversal,CompleteUnrolling));
 
-    VERIFY(test_redux(Matrix57().template block<PacketSize,3>(1,0),
-      DefaultTraversal,CompleteUnrolling));
+    VERIFY(test_redux(Matrix57().template block<PacketSize==1?2:PacketSize,3>(1,0),
+      SliceVectorizedTraversal,CompleteUnrolling));
+
+    if(PacketSize>1) {
+      VERIFY(test_redux(Matrix57().template block<PacketSize,2>(1,0),
+        DefaultTraversal,CompleteUnrolling));
+    }
 
     VERIFY((test_assign<
             Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >,
             Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>
-            >(DefaultTraversal,CompleteUnrolling)));
+            >(DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)));
 
     VERIFY((test_assign(Matrix57(), Matrix<Scalar,5*PacketSize,3>()*Matrix<Scalar,3,7>(),
-                        InnerVectorizedTraversal, InnerUnrolling|CompleteUnrolling)));
+                        InnerVectorizedTraversal, InnerUnrolling+CompleteUnrolling)));
     #endif
   }
 };
@@ -394,7 +390,7 @@ template<typename Scalar> struct vectorization_logic_half<Scalar,false>
   static void run() {}
 };
 
-void test_vectorization_logic()
+EIGEN_DECLARE_TEST(vectorization_logic)
 {
 
 #ifdef EIGEN_VECTORIZE
diff --git a/contrib/eigen/test/vectorwiseop.cpp b/contrib/eigen/test/vectorwiseop.cpp
index a099d17c876dc038013168c6409d22edffe6c09c..8ee58841af161162df62df76b915784c270bfc06 100644
--- a/contrib/eigen/test/vectorwiseop.cpp
+++ b/contrib/eigen/test/vectorwiseop.cpp
@@ -134,6 +134,7 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
   typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
   typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, 1> RealColVectorType;
   typedef Matrix<RealScalar, 1, MatrixType::ColsAtCompileTime> RealRowVectorType;
+  typedef Matrix<Scalar,Dynamic,Dynamic> MatrixX;
 
   Index rows = m.rows();
   Index cols = m.cols();
@@ -149,6 +150,19 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
   RealColVectorType rcres;
   RealRowVectorType rrres;
 
+  // test broadcast assignment
+  m2 = m1;
+  m2.colwise() = colvec;
+  for(Index j=0; j<cols; ++j)
+    VERIFY_IS_APPROX(m2.col(j), colvec);
+  m2.rowwise() = rowvec;
+  for(Index i=0; i<rows; ++i)
+    VERIFY_IS_APPROX(m2.row(i), rowvec);
+  if(rows>1)
+    VERIFY_RAISES_ASSERT(m2.colwise() = colvec.transpose());
+  if(cols>1)
+    VERIFY_RAISES_ASSERT(m2.rowwise() = rowvec.transpose());
+
   // test addition
 
   m2 = m1;
@@ -197,11 +211,23 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
     VERIFY_RAISES_ASSERT(m1.rowwise() - rowvec.transpose());
   }
 
-  // test norm
-  rrres = m1.colwise().norm();
-  VERIFY_IS_APPROX(rrres(c), m1.col(c).norm());
-  rcres = m1.rowwise().norm();
-  VERIFY_IS_APPROX(rcres(r), m1.row(r).norm());
+  // ------ partial reductions ------
+
+  #define TEST_PARTIAL_REDUX_BASIC(FUNC,ROW,COL,PREPROCESS) {                          \
+    ROW = m1 PREPROCESS .colwise().FUNC ;                                              \
+    for(Index k=0; k<cols; ++k) VERIFY_IS_APPROX(ROW(k), m1.col(k) PREPROCESS .FUNC ); \
+    COL = m1 PREPROCESS .rowwise().FUNC ;                                              \
+    for(Index k=0; k<rows; ++k) VERIFY_IS_APPROX(COL(k), m1.row(k) PREPROCESS .FUNC ); \
+  }
+
+  TEST_PARTIAL_REDUX_BASIC(sum(),        rowvec,colvec,EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(prod(),       rowvec,colvec,EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(mean(),       rowvec,colvec,EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(minCoeff(),   rrres, rcres, .real());
+  TEST_PARTIAL_REDUX_BASIC(maxCoeff(),   rrres, rcres, .real());
+  TEST_PARTIAL_REDUX_BASIC(norm(),       rrres, rcres, EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(squaredNorm(),rrres, rcres, EIGEN_EMPTY);
+  TEST_PARTIAL_REDUX_BASIC(redux(internal::scalar_sum_op<Scalar,Scalar>()),rowvec,colvec,EIGEN_EMPTY);
 
   VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum(), m1.colwise().template lpNorm<1>());
   VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().sum(), m1.rowwise().template lpNorm<1>());
@@ -235,14 +261,36 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
   m1 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows()));
   VERIFY_IS_APPROX( m1, m2 );
   VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) );
+
+  // test empty expressions
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().sum().eval(), MatrixX::Zero(rows,1));
+  VERIFY_IS_APPROX(m1.matrix().middleRows(0,0).colwise().sum().eval(), MatrixX::Zero(1,cols));
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,fix<0>).rowwise().sum().eval(), MatrixX::Zero(rows,1));
+  VERIFY_IS_APPROX(m1.matrix().middleRows(0,fix<0>).colwise().sum().eval(), MatrixX::Zero(1,cols));
+
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().prod().eval(), MatrixX::Ones(rows,1));
+  VERIFY_IS_APPROX(m1.matrix().middleRows(0,0).colwise().prod().eval(), MatrixX::Ones(1,cols));
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,fix<0>).rowwise().prod().eval(), MatrixX::Ones(rows,1));
+  VERIFY_IS_APPROX(m1.matrix().middleRows(0,fix<0>).colwise().prod().eval(), MatrixX::Ones(1,cols));
+  
+  VERIFY_IS_APPROX(m1.matrix().middleCols(0,0).rowwise().squaredNorm().eval(), MatrixX::Zero(rows,1));
+
+  VERIFY_RAISES_ASSERT(m1.real().middleCols(0,0).rowwise().minCoeff().eval());
+  VERIFY_RAISES_ASSERT(m1.real().middleRows(0,0).colwise().maxCoeff().eval());
+  VERIFY_IS_EQUAL(m1.real().middleRows(0,0).rowwise().maxCoeff().eval().rows(),0);
+  VERIFY_IS_EQUAL(m1.real().middleCols(0,0).colwise().maxCoeff().eval().cols(),0);
+  VERIFY_IS_EQUAL(m1.real().middleRows(0,fix<0>).rowwise().maxCoeff().eval().rows(),0);
+  VERIFY_IS_EQUAL(m1.real().middleCols(0,fix<0>).colwise().maxCoeff().eval().cols(),0);
 }
 
-void test_vectorwiseop()
+EIGEN_DECLARE_TEST(vectorwiseop)
 {
   CALL_SUBTEST_1( vectorwiseop_array(Array22cd()) );
   CALL_SUBTEST_2( vectorwiseop_array(Array<double, 3, 2>()) );
   CALL_SUBTEST_3( vectorwiseop_array(ArrayXXf(3, 4)) );
   CALL_SUBTEST_4( vectorwiseop_matrix(Matrix4cf()) );
+  CALL_SUBTEST_5( vectorwiseop_matrix(Matrix4f()) );
+  CALL_SUBTEST_5( vectorwiseop_matrix(Vector4f()) );
   CALL_SUBTEST_5( vectorwiseop_matrix(Matrix<float,4,5>()) );
   CALL_SUBTEST_6( vectorwiseop_matrix(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   CALL_SUBTEST_7( vectorwiseop_matrix(VectorXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/contrib/eigen/test/visitor.cpp b/contrib/eigen/test/visitor.cpp
index 7f4efab9709ddcdbba8ac28da079e45343dbd04a..20fb2c3ed56baa0e8653d8de126544fb1106e1c1 100644
--- a/contrib/eigen/test/visitor.cpp
+++ b/contrib/eigen/test/visitor.cpp
@@ -56,9 +56,44 @@ template<typename MatrixType> void matrixVisitor(const MatrixType& p)
   VERIFY_IS_APPROX(maxc, m.maxCoeff());
 
   eigen_maxc = (m.adjoint()*m).maxCoeff(&eigen_maxrow,&eigen_maxcol);
-  eigen_maxc = (m.adjoint()*m).eval().maxCoeff(&maxrow,&maxcol);
-  VERIFY(maxrow == eigen_maxrow);
-  VERIFY(maxcol == eigen_maxcol);
+  Index maxrow2=0,maxcol2=0;
+  eigen_maxc = (m.adjoint()*m).eval().maxCoeff(&maxrow2,&maxcol2);
+  VERIFY(maxrow2 == eigen_maxrow);
+  VERIFY(maxcol2 == eigen_maxcol);
+
+  if (!NumTraits<Scalar>::IsInteger && m.size() > 2) {
+    // Test NaN propagation by replacing an element with NaN.
+    bool stop = false;
+    for (Index j = 0; j < cols && !stop; ++j) {
+      for (Index i = 0; i < rows && !stop; ++i) {
+        if (!(j == mincol && i == minrow) &&
+            !(j == maxcol && i == maxrow)) {
+          m(i,j) = NumTraits<Scalar>::quiet_NaN();
+          stop = true;
+          break;
+        }
+      }
+    }
+
+    eigen_minc = m.template minCoeff<PropagateNumbers>(&eigen_minrow, &eigen_mincol);
+    eigen_maxc = m.template maxCoeff<PropagateNumbers>(&eigen_maxrow, &eigen_maxcol);
+    VERIFY(minrow == eigen_minrow);
+    VERIFY(maxrow == eigen_maxrow);
+    VERIFY(mincol == eigen_mincol);
+    VERIFY(maxcol == eigen_maxcol);
+    VERIFY_IS_APPROX(minc, eigen_minc);
+    VERIFY_IS_APPROX(maxc, eigen_maxc);
+    VERIFY_IS_APPROX(minc, m.template minCoeff<PropagateNumbers>());
+    VERIFY_IS_APPROX(maxc, m.template maxCoeff<PropagateNumbers>());
+
+    eigen_minc = m.template minCoeff<PropagateNaN>(&eigen_minrow, &eigen_mincol);
+    eigen_maxc = m.template maxCoeff<PropagateNaN>(&eigen_maxrow, &eigen_maxcol);
+    VERIFY(minrow != eigen_minrow || mincol != eigen_mincol);
+    VERIFY(maxrow != eigen_maxrow || maxcol != eigen_maxcol);
+    VERIFY((numext::isnan)(eigen_minc));
+    VERIFY((numext::isnan)(eigen_maxc));
+  }
+
 }
 
 template<typename VectorType> void vectorVisitor(const VectorType& w)
@@ -111,9 +146,34 @@ template<typename VectorType> void vectorVisitor(const VectorType& w)
   v2.maxCoeff(&eigen_maxidx);
   VERIFY(eigen_minidx == (std::min)(idx0,idx1));
   VERIFY(eigen_maxidx == (std::min)(idx0,idx2));
+
+  if (!NumTraits<Scalar>::IsInteger && size > 2) {
+    // Test NaN propagation by replacing an element with NaN.
+    for (Index i = 0; i < size; ++i) {
+      if (i != minidx && i != maxidx) {
+        v(i) = NumTraits<Scalar>::quiet_NaN();
+        break;
+      }
+    }
+    eigen_minc = v.template minCoeff<PropagateNumbers>(&eigen_minidx);
+    eigen_maxc = v.template maxCoeff<PropagateNumbers>(&eigen_maxidx);
+    VERIFY(minidx == eigen_minidx);
+    VERIFY(maxidx == eigen_maxidx);
+    VERIFY_IS_APPROX(minc, eigen_minc);
+    VERIFY_IS_APPROX(maxc, eigen_maxc);
+    VERIFY_IS_APPROX(minc, v.template minCoeff<PropagateNumbers>());
+    VERIFY_IS_APPROX(maxc, v.template maxCoeff<PropagateNumbers>());
+
+    eigen_minc = v.template minCoeff<PropagateNaN>(&eigen_minidx);
+    eigen_maxc = v.template maxCoeff<PropagateNaN>(&eigen_maxidx);
+    VERIFY(minidx != eigen_minidx);
+    VERIFY(maxidx != eigen_maxidx);
+    VERIFY((numext::isnan)(eigen_minc));
+    VERIFY((numext::isnan)(eigen_maxc));
+  }
 }
 
-void test_visitor()
+EIGEN_DECLARE_TEST(visitor)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( matrixVisitor(Matrix<float, 1, 1>()) );
diff --git a/contrib/eigen/test/zerosized.cpp b/contrib/eigen/test/zerosized.cpp
index 477ff0070c908d803bbd0de988e7eb1fffe112ed..07afd0f8651b78f99bf6c6c0ffe93b15c3f2b437 100644
--- a/contrib/eigen/test/zerosized.cpp
+++ b/contrib/eigen/test/zerosized.cpp
@@ -16,9 +16,18 @@ template<typename MatrixType> void zeroReduction(const MatrixType& m) {
   VERIFY(!m.any());
   VERIFY(m.prod()==1);
   VERIFY(m.sum()==0);
+  VERIFY(m.norm()==0);
+  VERIFY(m.squaredNorm()==0);
   VERIFY(m.count()==0);
   VERIFY(m.allFinite());
   VERIFY(!m.hasNaN());
+  VERIFY_RAISES_ASSERT( m.minCoeff() );
+  VERIFY_RAISES_ASSERT( m.maxCoeff() );
+  Index i,j;
+  VERIFY_RAISES_ASSERT( m.minCoeff(&i,&j) );
+  VERIFY_RAISES_ASSERT( m.maxCoeff(&i,&j) );
+  VERIFY_RAISES_ASSERT( m.reshaped().minCoeff(&i) );
+  VERIFY_RAISES_ASSERT( m.reshaped().maxCoeff(&i) );
 }
 
 
@@ -81,7 +90,7 @@ template<typename VectorType> void zeroSizedVector()
   }
 }
 
-void test_zerosized()
+EIGEN_DECLARE_TEST(zerosized)
 {
   zeroSizedMatrix<Matrix2d>();
   zeroSizedMatrix<Matrix3i>();
diff --git a/contrib/eigen/unsupported/CMakeLists.txt b/contrib/eigen/unsupported/CMakeLists.txt
index 9a5666105a95bfac9c18c2913f14ad74d998162b..34408c017004409072f71b860477790cb767dadf 100644
--- a/contrib/eigen/unsupported/CMakeLists.txt
+++ b/contrib/eigen/unsupported/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_subdirectory(Eigen)
-add_subdirectory(doc EXCLUDE_FROM_ALL)
+if(EIGEN_BUILD_DOC)
+  add_subdirectory(doc EXCLUDE_FROM_ALL)
+endif()
 if(BUILD_TESTING)
   if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
     add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
diff --git a/contrib/eigen/unsupported/Eigen/AdolcForward b/contrib/eigen/unsupported/Eigen/AdolcForward
index 15f5f0731a9426d7123816cadac4fc65f3c9e57f..56caeaebfefc56794cfc160ef3f29c359a56d56c 100644
--- a/contrib/eigen/unsupported/Eigen/AdolcForward
+++ b/contrib/eigen/unsupported/Eigen/AdolcForward
@@ -40,7 +40,7 @@
 # undef realloc
 #endif
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
@@ -74,6 +74,9 @@ inline adouble imag(const adouble&)    { return 0.; }
 inline adouble abs(const adouble&  x)  { return fabs(x); }
 inline adouble abs2(const adouble& x)  { return x*x; }
 
+inline bool (isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); }
+inline bool (isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); }
+
 }
 
 namespace Eigen {
diff --git a/contrib/eigen/unsupported/Eigen/AlignedVector3 b/contrib/eigen/unsupported/Eigen/AlignedVector3
index 47a86d4c000135a4851334b1dc8ac278d48fee1c..4fa1842acfe22be3eb64f88c4e9f7f5a5b2d9fa9 100644
--- a/contrib/eigen/unsupported/Eigen/AlignedVector3
+++ b/contrib/eigen/unsupported/Eigen/AlignedVector3
@@ -10,7 +10,9 @@
 #ifndef EIGEN_ALIGNED_VECTOR3
 #define EIGEN_ALIGNED_VECTOR3
 
-#include <Eigen/Geometry>
+#include "../../Eigen/Geometry"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 namespace Eigen {
 
@@ -76,6 +78,9 @@ template<typename _Scalar> class AlignedVector3
     { return m_coeffs.coeffRef(index);}
 
 
+    inline AlignedVector3()
+    {}
+
     inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
       : m_coeffs(x, y, z, Scalar(0))
     {}
@@ -129,6 +134,9 @@ template<typename _Scalar> class AlignedVector3
     inline AlignedVector3 operator-(const AlignedVector3& other) const
     { return AlignedVector3(m_coeffs - other.m_coeffs); }
 
+    inline AlignedVector3 operator-() const
+    { return AlignedVector3(-m_coeffs); }
+
     inline AlignedVector3 operator-=(const AlignedVector3& other)
     { m_coeffs -= other.m_coeffs; return *this; }
 
@@ -221,4 +229,6 @@ struct evaluator<AlignedVector3<Scalar> >
 
 }
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 #endif // EIGEN_ALIGNED_VECTOR3
diff --git a/contrib/eigen/unsupported/Eigen/ArpackSupport b/contrib/eigen/unsupported/Eigen/ArpackSupport
index a0d4820e125bd92430fcd9b59b173f3b17792c92..67c4ac838bf2943226fb90d75f625d1ed52fcbe5 100644
--- a/contrib/eigen/unsupported/Eigen/ArpackSupport
+++ b/contrib/eigen/unsupported/Eigen/ArpackSupport
@@ -9,7 +9,7 @@
 #ifndef EIGEN_ARPACKSUPPORT_MODULE_H
 #define EIGEN_ARPACKSUPPORT_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 /** \defgroup ArpackSupport_Module Arpack support module
   *
@@ -20,12 +20,11 @@
   * \endcode
   */
 
-#include <Eigen/SparseCholesky>
+#include "../../Eigen/SparseCholesky"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 #include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_ARPACKSUPPORT_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/unsupported/Eigen/AutoDiff b/contrib/eigen/unsupported/Eigen/AutoDiff
index abf5b7d678a41baca587efc07264aea5c8da7ec5..7a4ff460ce505d1801af0ef383525b1beb5bb628 100644
--- a/contrib/eigen/unsupported/Eigen/AutoDiff
+++ b/contrib/eigen/unsupported/Eigen/AutoDiff
@@ -28,11 +28,17 @@ namespace Eigen {
 //@{
 
 }
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 
 #include "src/AutoDiff/AutoDiffScalar.h"
 // #include "src/AutoDiff/AutoDiffVector.h"
 #include "src/AutoDiff/AutoDiffJacobian.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+
 namespace Eigen {
 //@}
 }
diff --git a/contrib/eigen/unsupported/Eigen/BVH b/contrib/eigen/unsupported/Eigen/BVH
index 0161a5402d3e125c31f2fb31ed7130ecf65ca6c8..666c9835f894b7ab6e03839ac1c5ced9cde56fff 100644
--- a/contrib/eigen/unsupported/Eigen/BVH
+++ b/contrib/eigen/unsupported/Eigen/BVH
@@ -10,9 +10,9 @@
 #ifndef EIGEN_BVH_MODULE_H
 #define EIGEN_BVH_MODULE_H
 
-#include <Eigen/Core>
-#include <Eigen/Geometry>
-#include <Eigen/StdVector>
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
+#include "../../Eigen/StdVector"
 #include <algorithm>
 #include <queue>
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/Tensor b/contrib/eigen/unsupported/Eigen/CXX11/Tensor
index bb6523d15ecf885fd64489f6c0895afdc94ef268..0938bb554da43256277e42971b355a242f96cf4f 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/Tensor
+++ b/contrib/eigen/unsupported/Eigen/CXX11/Tensor
@@ -13,21 +13,11 @@
 
 #include "../../../Eigen/Core"
 
-#ifdef EIGEN_USE_SYCL
-#undef min
-#undef max
-#undef isnan
-#undef isinf
-#undef isfinite
-#include <SYCL/sycl.hpp>
-#include <map>
-#include <memory>
-#include <utility>
-#endif
-
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#if EIGEN_HAS_CXX11
 
 #include "../SpecialFunctions"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
 
@@ -43,44 +33,25 @@
   * Much of the documentation can be found \ref eigen_tensors "here".
   */
 
+#include <atomic>
+#include <chrono>
 #include <cmath>
 #include <cstddef>
 #include <cstring>
-
-#ifdef _WIN32
-typedef __int16 int16_t;
-typedef unsigned __int16 uint16_t;
-typedef __int32 int32_t;
-typedef unsigned __int32 uint32_t;
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-#else
-#include <stdint.h>
-#endif
-
-#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
 #include <random>
-#endif
+#include <thread>
 
-#ifdef _WIN32
-#include <windows.h>
-#elif defined(__APPLE__)
-#include <mach/mach_time.h>
-#else
-#include <time.h>
-#endif
-
-#ifdef EIGEN_USE_THREADS
+#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
 #include "ThreadPool"
 #endif
 
 #ifdef EIGEN_USE_GPU
-#include <iostream>
-#include <cuda_runtime.h>
-#if __cplusplus >= 201103L
-#include <atomic>
-#include <unistd.h>
-#endif
+  #include <iostream>
+  #if defined(EIGEN_USE_HIP)
+    #include <hip/hip_runtime.h>
+  #else
+    #include <cuda_runtime.h>
+  #endif
 #endif
 
 #include "src/Tensor/TensorMacros.h"
@@ -90,7 +61,10 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorDeviceDefault.h"
 #include "src/Tensor/TensorDeviceThreadPool.h"
-#include "src/Tensor/TensorDeviceCuda.h"
+#include "src/Tensor/TensorDeviceGpu.h"
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
 #include "src/Tensor/TensorDeviceSycl.h"
 #include "src/Tensor/TensorIndexList.h"
 #include "src/Tensor/TensorDimensionList.h"
@@ -103,18 +77,19 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorGlobalFunctions.h"
 
 #include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
 
 #include "src/Tensor/TensorEvaluator.h"
 #include "src/Tensor/TensorExpr.h"
 #include "src/Tensor/TensorReduction.h"
-#include "src/Tensor/TensorReductionCuda.h"
+#include "src/Tensor/TensorReductionGpu.h"
 #include "src/Tensor/TensorArgMax.h"
 #include "src/Tensor/TensorConcatenation.h"
 #include "src/Tensor/TensorContractionMapper.h"
 #include "src/Tensor/TensorContractionBlocking.h"
 #include "src/Tensor/TensorContraction.h"
 #include "src/Tensor/TensorContractionThreadPool.h"
-#include "src/Tensor/TensorContractionCuda.h"
+#include "src/Tensor/TensorContractionGpu.h"
 #include "src/Tensor/TensorConversion.h"
 #include "src/Tensor/TensorConvolution.h"
 #include "src/Tensor/TensorFFT.h"
@@ -136,8 +111,15 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorGenerator.h"
 #include "src/Tensor/TensorAssign.h"
 #include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
+
+#ifdef EIGEN_USE_SYCL
+#include "src/Tensor/TensorReductionSycl.h"
+#include "src/Tensor/TensorConvolutionSycl.h"
+#include "src/Tensor/TensorContractionSycl.h"
+#include "src/Tensor/TensorScanSycl.h"
+#endif
 
-#include "src/Tensor/TensorSycl.h"
 #include "src/Tensor/TensorExecutor.h"
 #include "src/Tensor/TensorDevice.h"
 
@@ -149,6 +131,7 @@ typedef unsigned __int64 uint64_t;
 
 #include "src/Tensor/TensorIO.h"
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
+#endif  // EIGEN_HAS_CXX11
 //#endif // EIGEN_CXX11_TENSOR_MODULE
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/TensorSymmetry b/contrib/eigen/unsupported/Eigen/CXX11/TensorSymmetry
index fb1b0c0fbcd3e65e394bb18f418cc69afcee231c..b09c5e472d8d64f8ea7b989b53492c632f89018a 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/TensorSymmetry
+++ b/contrib/eigen/unsupported/Eigen/CXX11/TensorSymmetry
@@ -10,9 +10,9 @@
 #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
 #define EIGEN_CXX11_TENSORSYMMETRY_MODULE
 
-#include <unsupported/Eigen/CXX11/Tensor>
+#include "Tensor"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 #include "src/util/CXX11Meta.h"
 
@@ -33,7 +33,7 @@
 #include "src/TensorSymmetry/StaticSymmetry.h"
 #include "src/TensorSymmetry/DynamicSymmetry.h"
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/ThreadPool b/contrib/eigen/unsupported/Eigen/CXX11/ThreadPool
index 09d637e9a06881328eda7ff44e00b30fa0fc30bb..c5cafb2a1ece6f1d0b1fa10c0feff0e35bd53222 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/ThreadPool
+++ b/contrib/eigen/unsupported/Eigen/CXX11/ThreadPool
@@ -12,7 +12,7 @@
 
 #include "../../../Eigen/Core"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
   *
@@ -30,10 +30,9 @@
 
 // The code depends on CXX11, so only include the module if the
 // compiler supports it.
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+#if (EIGEN_COMP_CXXVER >= 11)
 #include <cstddef>
 #include <cstring>
-#include <stdint.h>
 #include <time.h>
 
 #include <vector>
@@ -44,22 +43,32 @@
 #include <thread>
 #include <functional>
 #include <memory>
+#include <utility>
+
+// There are non-parenthesized calls to "max" in the  <unordered_map> header,
+// which trigger a check in test/main.h causing compilation to fail.
+// We work around the check here by removing the check for max in
+// the case where we have to emulate thread_local.
+#ifdef max
+#undef max
+#endif
+#include <unordered_map>
 
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
 
 #include "src/ThreadPool/ThreadLocal.h"
 #include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/ThreadCancel.h"
 #include "src/ThreadPool/EventCount.h"
 #include "src/ThreadPool/RunQueue.h"
 #include "src/ThreadPool/ThreadPoolInterface.h"
 #include "src/ThreadPool/ThreadEnvironment.h"
-#include "src/ThreadPool/SimpleThreadPool.h"
+#include "src/ThreadPool/Barrier.h"
 #include "src/ThreadPool/NonBlockingThreadPool.h"
 
 #endif
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CXX11_THREADPOOL_MODULE
-
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
index da70fa216b6b81662e873a1cc6d961b7551c25a1..2f65b1b0e9f60a4c3ac6baabdeb68d6d15fd1ca6 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -3,8 +3,6 @@
 Tensors are multidimensional arrays of elements. Elements are typically scalars,
 but more complex types such as strings are also supported.
 
-[TOC]
-
 ## Tensor Classes
 
 You can manipulate a tensor with one of the following classes.  They all are in
@@ -21,7 +19,7 @@ matrix.
 Tensors of this class are resizable.  For example, if you assign a tensor of a
 different size to a Tensor, that tensor is resized to match its new value.
 
-#### Constructor `Tensor<data_type, rank>(size0, size1, ...)`
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
 
 Constructor for a Tensor.  The constructor must be passed `rank` integers
 indicating the sizes of the instance along each of the the `rank`
@@ -34,7 +32,7 @@ dimensions.
     // Resize t_3d by assigning a tensor of different sizes, but same rank.
     t_3d = Tensor<float, 3>(3, 4, 3);
 
-#### Constructor `Tensor<data_type, rank>(size_array)`
+#### Constructor Tensor<data_type, rank>(size_array)
 
 Constructor where the sizes for the constructor are specified as an array of
 values instead of an explicitly list of parameters.  The array type to use is
@@ -45,7 +43,7 @@ from an initializer list.
     Tensor<string, 2> t_2d({5, 7});
 
 
-### Class `TensorFixedSize<data_type, Sizes<size0, size1, ...>>`
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
 
 Class to use for tensors of fixed size, where the size is known at compile
 time.  Fixed sized tensors can provide very fast computations because all their
@@ -57,7 +55,7 @@ tensor data is held onto the stack and does not cause heap allocation and free.
     // Create a 4 x 3 tensor of floats.
     TensorFixedSize<float, Sizes<4, 3>> t_4x3;
 
-### Class `TensorMap<Tensor<data_type, rank>>`
+### Class TensorMap<Tensor<data_type, rank>>
 
 This is the class to use to create a tensor on top of memory allocated and
 owned by another part of your code.  It allows to view any piece of allocated
@@ -67,7 +65,7 @@ data are stored.
 A TensorMap is not resizable because it does not own the memory where its data
 are stored.
 
-#### Constructor `TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)`
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
 
 Constructor for a Tensor.  The constructor must be passed a pointer to the
 storage for the data, and "rank" size attributes.  The storage has to be
@@ -83,17 +81,17 @@ large enough to hold all the data.
 
     // You can also map fixed-size tensors.  Here we get a 1d view of
     // the 2d fixed-size tensor.
-    TensorFixedSize<float, Sizes<4, 5>> t_4x3;
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
     TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
 
 
-#### Class `TensorRef`
+#### Class TensorRef
 
 See Assigning to a TensorRef below.
 
 ## Accessing Tensor Elements
 
-#### `<data_type> tensor(index0, index1...)`
+#### <data_type> tensor(index0, index1...)
 
 Return the element at position `(index0, index1...)` in tensor
 `tensor`.  You must pass as many parameters as the rank of `tensor`.
@@ -278,7 +276,7 @@ Simiarly, assigning an expression to a TensorMap causes its evaluation.  Like
 tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
 have the rank and sizes of the expression that are assigned to them.
 
-#### Calling `eval()`.
+#### Calling eval().
 
 When you compute large composite expressions, you sometimes want to tell Eigen
 that an intermediate value in the expression tree is worth evaluating ahead of
@@ -355,7 +353,7 @@ call for the right hand side:
         (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
 
 
-#### Assigning to a `TensorRef`.
+#### Assigning to a TensorRef.
 
 If you need to access only a few elements from the value of an expression you
 can avoid materializing the value in a full tensor by using a TensorRef.
@@ -430,8 +428,11 @@ This is exactly the same as not inserting a `device()` call.
 
 #### Evaluating with a Thread Pool
 
+    // Create the Eigen ThreadPool
+    Eigen::ThreadPool pool(8 /* number of threads in pool */)
+
     // Create the Eigen ThreadPoolDevice.
-    Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */);
+    Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */);
 
     // Now just use the device when evaluating expressions.
     Eigen::Tensor<float, 2> c(30, 50);
@@ -452,24 +453,24 @@ memory for tensors with cuda.
 In the documentation of the tensor methods and Operation we mention datatypes
 that are tensor-type specific:
 
-#### `<Tensor-Type>::``Dimensions`
+#### <Tensor-Type>::Dimensions
 
 Acts like an array of ints.  Has an `int size` attribute, and can be
 indexed like an array to access individual values.  Used to represent the
 dimensions of a tensor.  See `dimensions()`.
 
-#### `<Tensor-Type>::``Index`
+#### <Tensor-Type>::Index
 
 Acts like an `int`.  Used for indexing tensors along their dimensions.  See
 `operator()`, `dimension()`, and `size()`.
 
-#### `<Tensor-Type>::``Scalar`
+#### <Tensor-Type>::Scalar
 
 Represents the datatype of individual tensor elements.  For example, for a
 `Tensor<float>`, `Scalar` is the type `float`.  See
 `setConstant()`.
 
-#### `<Operation>`
+#### <Operation>
 
 We use this pseudo type to indicate that a tensor Operation is returned by a
 method.  We indicate in the text the type and dimensions of the tensor that the
@@ -489,7 +490,7 @@ Tensor, TensorFixedSize, and TensorMap.
 
 ## Metadata
 
-### `int NumDimensions`
+### int NumDimensions
 
 Constant value indicating the number of dimensions of a Tensor.  This is also
 known as the tensor "rank".
@@ -498,7 +499,7 @@ known as the tensor "rank".
       cout << "Dims " << a.NumDimensions;
       => Dims 2
 
-### `Dimensions dimensions()`
+### Dimensions dimensions()
 
 Returns an array-like object representing the dimensions of the tensor.
 The actual type of the `dimensions()` result is `<Tensor-Type>::``Dimensions`.
@@ -516,7 +517,7 @@ If you use a C++11 compiler, you can use `auto` to simplify the code:
          << ", dim 1: " << d[1];
     => Dim size: 2, dim 0: 3, dim 1: 4
 
-### `Index dimension(Index n)`
+### Index dimension(Index n)
 
 Returns the n-th dimension of the tensor.  The actual type of the
 `dimension()` result is `<Tensor-Type>::``Index`, but you can
@@ -527,7 +528,7 @@ always use it like an int.
       cout << "Dim 1: " << dim1;
       => Dim 1: 4
 
-### `Index size()`
+### Index size()
 
 Returns the total number of elements in the tensor.  This is the product of all
 the tensor dimensions.  The actual type of the `size()` result is
@@ -602,7 +603,7 @@ You can use one of the methods below to initialize the tensor memory.  These
 have an immediate effect on the tensor and return the tensor itself as a
 result.  These are not tensor Operations which delay evaluation.
 
-### `<Tensor-Type> setConstant(const Scalar& val)`
+### <Tensor-Type> setConstant(const Scalar& val)
 
 Sets all elements of the tensor to the constant value `val`.  `Scalar`
 is the type of data stored in the tensor.  You can pass any value that is
@@ -630,7 +631,7 @@ has a copy constructor and an `operator=()`:
     yolo yolo yolo
 
 
-### `<Tensor-Type> setZero()`
+### <Tensor-Type> setZero()
 
 Fills the tensor with zeros.  Equivalent to `setConstant(Scalar(0))`.
 Returns the tensor itself in case you want to chain another call.
@@ -644,7 +645,7 @@ Returns the tensor itself in case you want to chain another call.
     0 0 0 0
 
 
-### `<Tensor-Type> setValues({..initializer_list})`
+### <Tensor-Type> setValues({..initializer_list})
 
 Fills the tensor with explicit values specified in a std::initializer_list.
 The type of the initializer list depends on the type and rank of the tensor.
@@ -680,7 +681,7 @@ code only sets the values of the first row of the tensor.
     10   20   30
     1000 1000 1000
 
-### `<Tensor-Type> setRandom()`
+### <Tensor-Type> setRandom()
 
 Fills the tensor with random values.  Returns the tensor itself in case you
 want to chain another call.
@@ -747,7 +748,7 @@ values of a tensor expression, the expression must either be evaluated or
 wrapped in a TensorRef.
 
 
-### `Scalar* data()` and `const Scalar* data() const`
+### Scalar* data() and const Scalar* data() const
 
 Returns a pointer to the storage for the tensor.  The pointer is const if the
 tensor was const.  This allows direct access to the data.  The layout of the
@@ -775,7 +776,7 @@ The chain of Operation is evaluated lazily, typically when it is assigned to a
 tensor.  See "Controlling when Expression are Evaluated" for more details about
 their evaluation.
 
-### `<Operation> constant(const Scalar& val)`
+### <Operation> constant(const Scalar& val)
 
 Returns a tensor of the same type and dimensions as the original tensor but
 where all elements have the value `val`.
@@ -803,7 +804,7 @@ tensor, or multiply every element of a tensor by a scalar.
     0.6 0.6 0.6
     0.6 0.6 0.6
 
-### `<Operation> random()`
+### <Operation> random()
 
 Returns a tensor of the same type and dimensions as the current tensor
 but where all elements have random values.
@@ -833,7 +834,7 @@ All these operations take a single input tensor as argument and return a tensor
 of the same type and dimensions as the tensor to which they are applied.  The
 requested operations are applied to each element independently.
 
-### `<Operation> operator-()`
+### <Operation> operator-()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the opposite values of the original tensor.
@@ -852,42 +853,42 @@ containing the opposite values of the original tensor.
     -1 -1 -1
     -1 -1 -1
 
-### `<Operation> sqrt()`
+### <Operation> sqrt()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the square roots of the original tensor.
 
-### `<Operation> rsqrt()`
+### <Operation> rsqrt()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the inverse square roots of the original tensor.
 
-### `<Operation> square()`
+### <Operation> square()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the squares of the original tensor values.
 
-### `<Operation> inverse()`
+### <Operation> inverse()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the inverse of the original tensor values.
 
-### `<Operation> exp()`
+### <Operation> exp()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the exponential of the original tensor.
 
-### `<Operation> log()`
+### <Operation> log()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the natural logarithms of the original tensor.
 
-### `<Operation> abs()`
+### <Operation> abs()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the absolute values of the original tensor.
 
-### `<Operation> pow(Scalar exponent)`
+### <Operation> pow(Scalar exponent)
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the coefficients of the original tensor to the power of the
@@ -914,17 +915,17 @@ cubic roots of an int Tensor:
     0 1 2
     3 4 5
 
-### `<Operation>  operator * (Scalar scale)`
+### <Operation>  operator * (Scalar scale)
 
 Multiplies all the coefficients of the input tensor by the provided scale.
 
-### `<Operation>  cwiseMax(Scalar threshold)`
+### <Operation>  cwiseMax(Scalar threshold)
 TODO
 
-### `<Operation>  cwiseMin(Scalar threshold)`
+### <Operation>  cwiseMin(Scalar threshold)
 TODO
 
-### `<Operation>  unaryExpr(const CustomUnaryOp& func)`
+### <Operation>  unaryExpr(const CustomUnaryOp& func)
 TODO
 
 
@@ -936,39 +937,39 @@ dimensions as the tensors to which they are applied, and unless otherwise
 specified it is also of the same type. The requested operations are applied to
 each pair of elements independently.
 
-### `<Operation> operator+(const OtherDerived& other)`
+### <Operation> operator+(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise sums of the inputs.
 
-### `<Operation> operator-(const OtherDerived& other)`
+### <Operation> operator-(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise differences of the inputs.
 
-### `<Operation> operator*(const OtherDerived& other)`
+### <Operation> operator*(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise products of the inputs.
 
-### `<Operation> operator/(const OtherDerived& other)`
+### <Operation> operator/(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise quotients of the inputs.
 
 This operator is not supported for integer types.
 
-### `<Operation> cwiseMax(const OtherDerived& other)`
+### <Operation> cwiseMax(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise maximums of the inputs.
 
-### `<Operation> cwiseMin(const OtherDerived& other)`
+### <Operation> cwiseMin(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise mimimums of the inputs.
 
-### `<Operation> Logical operators`
+### <Operation> Logical operators
 
 The following logical operators are supported as well:
 
@@ -1126,55 +1127,107 @@ scalar, represented as a zero-dimension tensor.
     276
 
 
-### `<Operation> sum(const Dimensions& new_dims)`
-### `<Operation> sum()`
+### <Operation> sum(const Dimensions& new_dims)
+### <Operation> sum()
 
 Reduce a tensor using the sum() operator.  The resulting values
 are the sum of the reduced values.
 
-### `<Operation> mean(const Dimensions& new_dims)`
-### `<Operation> mean()`
+### <Operation> mean(const Dimensions& new_dims)
+### <Operation> mean()
 
 Reduce a tensor using the mean() operator.  The resulting values
 are the mean of the reduced values.
 
-### `<Operation> maximum(const Dimensions& new_dims)`
-### `<Operation> maximum()`
+### <Operation> maximum(const Dimensions& new_dims)
+### <Operation> maximum()
 
 Reduce a tensor using the maximum() operator.  The resulting values are the
 largest of the reduced values.
 
-### `<Operation> minimum(const Dimensions& new_dims)`
-### `<Operation> minimum()`
+### <Operation> minimum(const Dimensions& new_dims)
+### <Operation> minimum()
 
 Reduce a tensor using the minimum() operator.  The resulting values
 are the smallest of the reduced values.
 
-### `<Operation> prod(const Dimensions& new_dims)`
-### `<Operation> prod()`
+### <Operation> prod(const Dimensions& new_dims)
+### <Operation> prod()
 
 Reduce a tensor using the prod() operator.  The resulting values
 are the product of the reduced values.
 
-### `<Operation> all(const Dimensions& new_dims)`
-### `<Operation> all()`
+### <Operation> all(const Dimensions& new_dims)
+### <Operation> all()
 Reduce a tensor using the all() operator.  Casts tensor to bool and then checks
 whether all elements are true.  Runs through all elements rather than
 short-circuiting, so may be significantly inefficient.
 
-### `<Operation> any(const Dimensions& new_dims)`
-### `<Operation> any()`
+### <Operation> any(const Dimensions& new_dims)
+### <Operation> any()
 Reduce a tensor using the any() operator.  Casts tensor to bool and then checks
 whether any element is true.  Runs through all elements rather than
 short-circuiting, so may be significantly inefficient.
 
 
-### `<Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)`
+### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
 
 Reduce a tensor using a user-defined reduction operator.  See `SumReducer`
 in TensorFunctors.h for information on how to implement a reduction operator.
 
 
+## Trace
+
+A *Trace* operation returns a tensor with fewer dimensions than the original
+tensor. It returns a tensor whose elements are the sum of the elements of the
+original tensor along the main diagonal for a list of specified dimensions, the
+"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions
+are passed as an input parameter to the operation, are of type `<TensorType>::``Dimensions`
+, and have the same requirements when passed as an input parameter. In addition,
+the trace dimensions must have the same size.
+
+Example: Trace along 2 dimensions.
+
+    // Create a tensor of 3 dimensions
+    Eigen::Tensor<int, 3> a(2, 2, 3);
+    a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
+    // Specify the dimensions along which the trace will be computed.
+    // In this example, the trace can only be computed along the dimensions
+    // with indices 0 and 1
+    Eigen::array<int, 2> dims({0, 1});
+    // The output tensor contains all but the trace dimensions.
+    Tensor<int, 1> a_trace = a.trace(dims);
+    cout << "a_trace:" << endl;
+    cout << a_trace << endl;
+    =>
+    a_trace:
+    11
+    13
+    15
+
+
+### <Operation> trace(const Dimensions& new_dims)
+### <Operation> trace()
+
+As a special case, if no parameter is passed to the operation, trace is computed
+along *all* dimensions of the input tensor.
+
+Example: Trace along all dimensions.
+
+    // Create a tensor of 3 dimensions, with all dimensions having the same size.
+    Eigen::Tensor<int, 3> a(3, 3, 3);
+    a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
+                {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
+                {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
+    // Result is a zero dimension tensor
+    Tensor<int, 0> a_trace = a.trace();
+    cout<<"a_trace:"<<endl;
+    cout<<a_trace<<endl;
+    =>
+    a_trace:
+    42
+
+
 ## Scan Operations
 
 A *Scan* operation returns a tensor with the same dimensions as the original
@@ -1204,18 +1257,18 @@ dd a comment to this line
     1  3  6
     4  9 15
 
-### `<Operation> cumsum(const Index& axis)`
+### <Operation> cumsum(const Index& axis)
 
 Perform a scan by summing consecutive entries.
 
-### `<Operation> cumprod(const Index& axis)`
+### <Operation> cumprod(const Index& axis)
 
 Perform a scan by multiplying consecutive entries.
 
 
 ## Convolutions
 
-### `<Operation> convolve(const Kernel& kernel, const Dimensions& dims)`
+### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
 
 Returns a tensor that is the output of the convolution of the input tensor with the kernel,
 along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor
@@ -1258,7 +1311,7 @@ These operations return a Tensor with different dimensions than the original
 Tensor.  They can be used to access slices of tensors, see them with different
 dimensions, or pad tensors with additional data.
 
-### `<Operation> reshape(const Dimensions& new_dims)`
+### <Operation> reshape(const Dimensions& new_dims)
 
 Returns a view of the input tensor that has been reshaped to the specified
 new dimensions.  The argument new_dims is an array of Index values.  The
@@ -1337,7 +1390,7 @@ Note that "b" itself was not reshaped but that instead the assignment is done to
 the reshape view of b.
 
 
-### `<Operation> shuffle(const Shuffle& shuffle)`
+### <Operation> shuffle(const Shuffle& shuffle)
 
 Returns a copy of the input tensor whose dimensions have been
 reordered according to the specified permutation. The argument shuffle
@@ -1378,7 +1431,7 @@ Let's rewrite the previous example to take advantage of this feature:
     output.shuffle({2, 0, 1}) = input;
 
 
-### `<Operation> stride(const Strides& strides)`
+### <Operation> stride(const Strides& strides)
 
 Returns a view of the input tensor that strides (skips stride-1
 elements) along each of the dimensions.  The argument strides is an
@@ -1404,7 +1457,7 @@ It is possible to assign a tensor to a stride:
     output.stride({2, 3, 4}) = input;
 
 
-### `<Operation> slice(const StartIndices& offsets, const Sizes& extents)`
+### <Operation> slice(const StartIndices& offsets, const Sizes& extents)
 
 Returns a sub-tensor of the given tensor. For each dimension i, the slice is
 made of the coefficients stored between offset[i] and offset[i] + extents[i] in
@@ -1430,7 +1483,7 @@ the input tensor.
      600   700
 
 
-### `<Operation> chip(const Index offset, const Index dim)`
+### <Operation> chip(const Index offset, const Index dim)
 
 A chip is a special kind of slice. It is the subtensor at the given offset in
 the dimension dim. The returned tensor has one fewer dimension than the input
@@ -1481,7 +1534,7 @@ lvalue. For example:
          0     0     0
 
 
-### `<Operation> reverse(const ReverseDimensions& reverse)`
+### <Operation> reverse(const ReverseDimensions& reverse)
 
 Returns a view of the input tensor that reverses the order of the coefficients
 along a subset of the dimensions.  The argument reverse is an array of boolean
@@ -1511,7 +1564,7 @@ of a 2D tensor:
        0   100   200
 
 
-### `<Operation> broadcast(const Broadcast& broadcast)`
+### <Operation> broadcast(const Broadcast& broadcast)
 
 Returns a view of the input tensor in which the input is replicated one to many
 times.
@@ -1535,11 +1588,11 @@ made in each of the dimensions.
        0   100   200    0   100   200
      300   400   500  300   400   500
 
-### `<Operation> concatenate(const OtherDerived& other, Axis axis)`
+### <Operation> concatenate(const OtherDerived& other, Axis axis)
 
 TODO
 
-### `<Operation>  pad(const PaddingDimensions& padding)`
+### <Operation>  pad(const PaddingDimensions& padding)
 
 Returns a view of the input tensor in which the input is padded with zeros.
 
@@ -1564,7 +1617,7 @@ Returns a view of the input tensor in which the input is padded with zeros.
        0     0     0    0
 
 
-### `<Operation>  extract_patches(const PatchDims& patch_dims)`
+### <Operation>  extract_patches(const PatchDims& patch_dims)
 
 Returns a tensor of coefficient patches extracted from the input tensor, where
 each patch is of dimension specified by 'patch_dims'. The returned tensor has
@@ -1575,83 +1628,83 @@ dimension in RowMajor layout.
 
 For example, given the following input tensor:
 
-  Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
-  tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
-                    {4.0f, 5.0f, 6.0f, 7.0f},
-                    {8.0f, 9.0f, 10.0f, 11.0f}});
+    Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
+    tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
+                      {4.0f, 5.0f, 6.0f, 7.0f},
+                      {8.0f, 9.0f, 10.0f, 11.0f}});
 
-  cout << "tensor: " << endl << tensor << endl;
-=>
-tensor:
- 0   1   2   3
- 4   5   6   7
- 8   9  10  11
+    cout << "tensor: " << endl << tensor << endl;
+    =>
+    tensor:
+     0   1   2   3
+     4   5   6   7
+     8   9  10  11
 
 Six 2x2 patches can be extracted and indexed using the following code:
 
-  Eigen::Tensor<float, 3, DataLayout> patch;
-  Eigen::array<ptrdiff_t, 2> patch_dims;
-  patch_dims[0] = 2;
-  patch_dims[1] = 2;
-  patch = tensor.extract_patches(patch_dims);
-  for (int k = 0; k < 6; ++k) {
-    cout << "patch index: " << k << endl;
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 2; ++j) {
-        if (DataLayout == ColMajor) {
-          cout << patch(i, j, k) << " ";
-        } else {
-          cout << patch(k, i, j) << " ";
-        }
+    Eigen::Tensor<float, 3, DataLayout> patch;
+    Eigen::array<ptrdiff_t, 2> patch_dims;
+    patch_dims[0] = 2;
+    patch_dims[1] = 2;
+    patch = tensor.extract_patches(patch_dims);
+    for (int k = 0; k < 6; ++k) {
+      cout << "patch index: " << k << endl;
+      for (int i = 0; i < 2; ++i) {
+    	for (int j = 0; j < 2; ++j) {
+    	  if (DataLayout == ColMajor) {
+    		cout << patch(i, j, k) << " ";
+    	  } else {
+    		cout << patch(k, i, j) << " ";
+    	  }
+    	}
+    	cout << endl;
       }
-      cout << endl;
     }
-  }
 
 This code results in the following output when the data layout is ColMajor:
 
-patch index: 0
-0 1
-4 5
-patch index: 1
-4 5
-8 9
-patch index: 2
-1 2
-5 6
-patch index: 3
-5 6
-9 10
-patch index: 4
-2 3
-6 7
-patch index: 5
-6 7
-10 11
+    patch index: 0
+    0 1
+    4 5
+    patch index: 1
+    4 5
+    8 9
+    patch index: 2
+    1 2
+    5 6
+    patch index: 3
+    5 6
+    9 10
+    patch index: 4
+    2 3
+    6 7
+    patch index: 5
+    6 7
+    10 11
 
 This code results in the following output when the data layout is RowMajor:
 (NOTE: the set of patches is the same as in ColMajor, but are indexed differently).
 
-patch index: 0
-0 1
-4 5
-patch index: 1
-1 2
-5 6
-patch index: 2
-2 3
-6 7
-patch index: 3
-4 5
-8 9
-patch index: 4
-5 6
-9 10
-patch index: 5
-6 7
-10 11
-
-### `<Operation>  extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)`
+    patch index: 0
+    0 1
+    4 5
+    patch index: 1
+    1 2
+    5 6
+    patch index: 2
+    2 3
+    6 7
+    patch index: 3
+    4 5
+    8 9
+    patch index: 4
+    5 6
+    9 10
+    patch index: 5
+    6 7
+    10 11
+
+### <Operation>  extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)
 
 Returns a tensor of coefficient image patches extracted from the input tensor,
 which is expected to have dimensions ordered as follows (depending on the data
@@ -1681,32 +1734,34 @@ sizes:
  *) columns: 5
  *) batch:   7
 
-  Tensor<float, 4> tensor(2,3,5,7);
-  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+    Tensor<float, 4> tensor(2,3,5,7);
+    Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
 
 2x2 image patches can be extracted and indexed using the following code:
 
 *) 2D patch: ColMajor (patch indexed by second-to-last dimension)
-  Tensor<float, 5> twod_patch;
-  twod_patch = tensor.extract_image_patches<2, 2>();
-  // twod_patch.dimension(0) == 2
-  // twod_patch.dimension(1) == 2
-  // twod_patch.dimension(2) == 2
-  // twod_patch.dimension(3) == 3*5
-  // twod_patch.dimension(4) == 7
+
+    Tensor<float, 5> twod_patch;
+    twod_patch = tensor.extract_image_patches<2, 2>();
+    // twod_patch.dimension(0) == 2
+    // twod_patch.dimension(1) == 2
+    // twod_patch.dimension(2) == 2
+    // twod_patch.dimension(3) == 3*5
+    // twod_patch.dimension(4) == 7
 
 *) 2D patch: RowMajor (patch indexed by the second dimension)
-  Tensor<float, 5, RowMajor> twod_patch_row_major;
-  twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
-  // twod_patch_row_major.dimension(0) == 7
-  // twod_patch_row_major.dimension(1) == 3*5
-  // twod_patch_row_major.dimension(2) == 2
-  // twod_patch_row_major.dimension(3) == 2
-  // twod_patch_row_major.dimension(4) == 2
+
+    Tensor<float, 5, RowMajor> twod_patch_row_major;
+    twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
+    // twod_patch_row_major.dimension(0) == 7
+    // twod_patch_row_major.dimension(1) == 3*5
+    // twod_patch_row_major.dimension(2) == 2
+    // twod_patch_row_major.dimension(3) == 2
+    // twod_patch_row_major.dimension(4) == 2
 
 ## Special Operations
 
-### `<Operation> cast<T>()`
+### <Operation> cast<T>()
 
 Returns a tensor of type T with the same dimensions as the original tensor.
 The returned tensor contains the values of the original tensor converted to
@@ -1735,7 +1790,7 @@ but you can easily cast the tensors to floats to do the division:
     1 2 2
 
 
-### `<Operation>     eval()`
+### <Operation>     eval()
 
 TODO
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 00295a255cf4aa8e8d86aa246eed8bfb11c7e265..8cac2bb1234ae8e795f72362703465c7d5ca4765 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -112,7 +112,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -388,6 +388,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
+
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
@@ -398,6 +399,20 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
 
+    #if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(Self&& other)
+      : m_storage(std::move(other.m_storage))
+    {
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(Self&& other)
+    {
+      m_storage = std::move(other.m_storage);
+      return *this;
+    }
+    #endif
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
     {
@@ -462,6 +477,18 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       // Nothing to do: rank 0 tensors have fixed size
     }
 
+#ifdef EIGEN_HAS_INDEX_LIST
+    template <typename FirstType, typename... OtherTypes>
+    EIGEN_DEVICE_FUNC
+    void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+      array<Index, NumIndices> dims;
+      for (int i = 0; i < NumIndices; ++i) {
+        dims[i] = static_cast<Index>(dimensions[i]);
+      }
+      resize(dims);
+    }
+#endif
+
     /** Custom Dimension */
 #ifdef EIGEN_HAS_SFINAE
     template<typename CustomDimension,
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index d06f40cd866c425143a0f69412e1aec406121f2c..8b8fb923523d6c42df61f5b9ce7e6fc877fd4094 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -37,7 +37,7 @@ struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
 template<typename XprType>
 struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
 {
-  typedef const TensorIndexTupleOp<XprType>& type;
+  typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename XprType>
@@ -82,28 +82,35 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
 
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   static const int NumDims = internal::array_size<Dimensions>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
     PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device) { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -117,7 +124,13 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
@@ -147,7 +160,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
 template<typename ReduceOp, typename Dims, typename XprType>
 struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
 {
-  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
+  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename ReduceOp, typename Dims, typename XprType>
@@ -172,7 +185,7 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
                                                           const ReduceOp& reduce_op,
-                                                          const int return_dim,
+                                                          const Index return_dim,
                                                           const Dims& reduce_dims)
       : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
 
@@ -187,12 +200,12 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
   const Dims& reduce_dims() const { return m_reduce_dims; }
 
   EIGEN_DEVICE_FUNC
-  int return_dim() const { return m_return_dim; }
+  Index return_dim() const { return m_return_dim; }
 
   protected:
     typename XprType::Nested m_xpr;
     const ReduceOp m_reduce_op;
-    const int m_return_dim;
+    const Index m_return_dim;
     const Dims m_reduce_dims;
 };
 
@@ -209,21 +222,29 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
   static const int NumDims = internal::array_size<InputDimensions>::value;
   typedef array<Index, NumDims> StrideDims;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<TupleType, Device> TupleStorageMem;
 
   enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
-    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
-    BlockAccess = false,
-    Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess      = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_orig_impl(op.expression(), device),
         m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
-        m_return_dim(op.return_dim()) {
-
+        m_return_dim(op.return_dim())
+  {
     gen_strides(m_orig_impl.dimensions(), m_strides);
     if (Layout == static_cast<int>(ColMajor)) {
       const Index total_size = internal::array_prod(m_orig_impl.dimensions());
@@ -231,19 +252,22 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     } else {
       const Index total_size = internal::array_prod(m_orig_impl.dimensions());
       m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
-    }
-    m_stride_div = m_strides[m_return_dim];
+    }    
+    // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
+    m_stride_div = ((m_return_dim >= 0) &&
+                    (m_return_dim < static_cast<Index>(m_strides.size())))
+                   ? m_strides[m_return_dim] : 1;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -252,7 +276,13 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_orig_impl.bind(cgh);
+  }
+#endif
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
@@ -288,7 +318,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
  protected:
   TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
   TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
-  const int m_return_dim;
+  const Index m_return_dim;
   StrideDims m_strides;
   Index m_stride_mod;
   Index m_stride_div;
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 166be200c5587be71dd7b6bfce5bb8ea89e4a34a..e5811d63fa63b7fd98ef94bbcfd4f0335040e87e 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -34,6 +34,7 @@ struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
   static const int Layout = internal::traits<LhsXprType>::Layout;
+  typedef typename traits<LhsXprType>::PointerType PointerType;
 
   enum {
     Flags = 0
@@ -67,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType>
   typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
 
+  static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
       : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
 
@@ -94,20 +97,41 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int NumDims = XprType::NumDims;
 
   enum {
-    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+    IsAligned         = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
+                        int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess      = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::PacketAccess),
+    BlockAccess       = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = TensorEvaluator<LeftArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
+      RightTensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device) :
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
   {
-    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(
+        (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+        YOU_MADE_A_PROGRAMMING_MISTAKE);
   }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -118,7 +142,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     return m_rightImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
@@ -127,7 +151,19 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     // by the rhs to the lhs.
     return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(
+          m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
   }
@@ -136,6 +172,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+
     const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
     const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
@@ -163,12 +200,41 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
            TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
   }
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::merge(
+        m_leftImpl.getResourceRequirements(),
+        m_rightImpl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
+        m_leftImpl.data() != NULL) {
+      // If destination has raw data access, we pass it as a potential
+      // destination for a block descriptor evaluation.
+      desc.template AddDestinationBuffer<Layout>(
+          /*dst_base=*/m_leftImpl.data() + desc.offset(),
+          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
+    }
+
+    RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
+    // If block was evaluated into a destination, there is no need to do assignment.
+    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+      m_leftImpl.writeBlock(desc, block);
+    }
+    block.cleanup();
+  }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+#endif
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
 
  private:
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index f573608d9638b639c26cd7d3cfc62e30c5c2be7b..35b6458e5a813518d0ba0bcc7d3949d931914b2d 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -20,7 +20,7 @@ namespace Eigen {
   * \brief The tensor base class.
   *
   * This class is the common parent of the Tensor and TensorMap class, thus
-  * making it possible to use either class interchangably in expressions.
+  * making it possible to use either class interchangeably in expressions.
   */
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 // FIXME Doxygen does not like the inheritance with different template parameters
@@ -135,6 +135,78 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_digamma_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0_op<Scalar>, const Derived>
+    bessel_i0() const {
+      return unaryExpr(internal::scalar_bessel_i0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0e_op<Scalar>, const Derived>
+    bessel_i0e() const {
+      return unaryExpr(internal::scalar_bessel_i0e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1_op<Scalar>, const Derived>
+    bessel_i1() const {
+      return unaryExpr(internal::scalar_bessel_i1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1e_op<Scalar>, const Derived>
+    bessel_i1e() const {
+      return unaryExpr(internal::scalar_bessel_i1e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j0_op<Scalar>, const Derived>
+    bessel_j0() const {
+      return unaryExpr(internal::scalar_bessel_j0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y0_op<Scalar>, const Derived>
+    bessel_y0() const {
+      return unaryExpr(internal::scalar_bessel_y0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j1_op<Scalar>, const Derived>
+    bessel_j1() const {
+      return unaryExpr(internal::scalar_bessel_j1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y1_op<Scalar>, const Derived>
+    bessel_y1() const {
+      return unaryExpr(internal::scalar_bessel_y1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0_op<Scalar>, const Derived>
+    bessel_k0() const {
+      return unaryExpr(internal::scalar_bessel_k0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0e_op<Scalar>, const Derived>
+    bessel_k0e() const {
+      return unaryExpr(internal::scalar_bessel_k0e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1_op<Scalar>, const Derived>
+    bessel_k1() const {
+      return unaryExpr(internal::scalar_bessel_k1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1e_op<Scalar>, const Derived>
+    bessel_k1e() const {
+      return unaryExpr(internal::scalar_bessel_k1e_op<Scalar>());
+    }
+
     // igamma(a = this, x = other)
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
@@ -142,6 +214,20 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
     }
 
+    // igamma_der_a(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived>
+    igamma_der_a(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>());
+    }
+
+    // gamma_sample_der_alpha(alpha = this, sample = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived>
+    gamma_sample_der_alpha(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>());
+    }
+
     // igammac(a = this, x = other)
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
@@ -176,9 +262,15 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sigmoid_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
+    ndtri() const {
+      return unaryExpr(internal::scalar_ndtri_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
     sigmoid() const {
-      return unaryExpr(internal::scalar_sigmoid_op<Scalar>());
+      return unaryExpr(internal::scalar_logistic_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
@@ -187,6 +279,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_exp_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
+    expm1() const {
+      return unaryExpr(internal::scalar_expm1_op<Scalar>());
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
     log() const {
@@ -199,6 +297,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_log1p_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived>
+    log2() const {
+      return unaryExpr(internal::scalar_log2_op<Scalar>());
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
@@ -206,9 +310,17 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
+    clip(Scalar min, Scalar max) const {
+      return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex,
+                                                             TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
+                                                             Derived>::type
     conjugate() const {
-      return unaryExpr(internal::scalar_conjugate_op<Scalar>());
+      return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived());
     }
 
     EIGEN_DEVICE_FUNC
@@ -289,22 +401,27 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_mod_op<Scalar>(rhs));
     }
 
+    template <int NanPropagation=PropagateFast>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+        EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     cwiseMax(Scalar threshold) const {
-      return cwiseMax(constant(threshold));
+      return cwiseMax<NanPropagation>(constant(threshold));
     }
 
+    template <int NanPropagation=PropagateFast>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+        EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     cwiseMin(Scalar threshold) const {
-      return cwiseMin(constant(threshold));
+      return cwiseMin<NanPropagation>(constant(threshold));
     }
 
-    template <typename NewType> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived>
+    template<typename NewType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value,
+                                                             Derived,
+                                                             TensorConversionOp<NewType, const Derived> >::type
     cast() const {
-      return TensorConversionOp<NewType, const Derived>(derived());
+      return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived()));
     }
 
     EIGEN_DEVICE_FUNC
@@ -313,6 +430,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_round_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived>
+    rint() const {
+      return unaryExpr(internal::scalar_rint_op<Scalar>());
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived>
     ceil() const {
@@ -357,16 +480,16 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
     }
 
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+  template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMax(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>());
+    return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
     }
 
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+  template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMin(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>());
+      return binaryExpr(other.derived(), internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -481,9 +604,15 @@ class TensorBase<Derived, ReadOnlyAccessors>
     typedef Eigen::IndexPair<Index> DimensionPair;
 
     template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>
     contract(const OtherDerived& other, const Dimensions& dims) const {
-      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims);
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims);
+    }
+
+    template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>
+    contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const {
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel);
     }
 
     // Convolutions.
@@ -496,8 +625,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     // Fourier transforms
     template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
-    fft(const FFT& fft) const {
-      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft);
+    fft(const FFT& dims) const {
+      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), dims);
     }
 
     // Scan.
@@ -559,51 +688,53 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
     }
 
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>
+    template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
     maximum(const Dims& dims) const {
-      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
     }
 
-    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    template <int NanPropagation=PropagateFast>
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
     maximum() const {
       DimensionList<Index, NumDimensions> in_dims;
-      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
     }
 
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>
+    template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
     minimum(const Dims& dims) const {
-      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
     }
 
-    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    template <int NanPropagation=PropagateFast>
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
     minimum() const {
       DimensionList<Index, NumDimensions> in_dims;
-      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> >
+    const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
     all(const Dims& dims) const {
       return cast<bool>().reduce(dims, internal::AndReducer());
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
+    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
     all() const {
       DimensionList<Index, NumDimensions> in_dims;
       return cast<bool>().reduce(in_dims, internal::AndReducer());
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> >
+    const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
     any(const Dims& dims) const {
       return cast<bool>().reduce(dims, internal::OrReducer());
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
+    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
     any() const {
       DimensionList<Index, NumDimensions> in_dims;
       return cast<bool>().reduce(in_dims, internal::OrReducer());
@@ -615,7 +746,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
       const array<Index, NumDimensions>, const Derived>
     argmax() const {
       array<Index, NumDimensions> in_dims;
-      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
       return TensorTupleReducerOp<
         internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
         const array<Index, NumDimensions>,
@@ -628,7 +759,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
       const array<Index, NumDimensions>, const Derived>
     argmin() const {
       array<Index, NumDimensions> in_dims;
-      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
       return TensorTupleReducerOp<
         internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
         const array<Index, NumDimensions>,
@@ -639,7 +770,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
     const TensorTupleReducerOp<
       internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, 1>, const Derived>
-    argmax(const int return_dim) const {
+    argmax(const Index return_dim) const {
       array<Index, 1> in_dims;
       in_dims[0] = return_dim;
       return TensorTupleReducerOp<
@@ -652,7 +783,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
     const TensorTupleReducerOp<
       internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, 1>, const Derived>
-    argmin(const int return_dim) const {
+    argmin(const Index return_dim) const {
       array<Index, 1> in_dims;
       in_dims[0] = return_dim;
       return TensorTupleReducerOp<
@@ -667,10 +798,22 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
     }
 
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorTraceOp<const Dims, const Derived>
+    trace(const Dims& dims) const {
+      return TensorTraceOp<const Dims, const Derived>(derived(), dims);
+    }
+
+    const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>
+    trace() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims);
+    }
+
     template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorBroadcastingOp<const Broadcast, const Derived>
-    broadcast(const Broadcast& broadcast) const {
-      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast);
+    broadcast(const Broadcast& bcast) const {
+      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), bcast);
     }
 
     template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -778,8 +921,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorShufflingOp<const Shuffle, const Derived>
-    shuffle(const Shuffle& shuffle) const {
-      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
     }
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorStridingOp<const Strides, const Derived>
@@ -820,7 +963,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
   protected:
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+    template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase;
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
@@ -828,6 +972,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
 template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
 class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
  public:
+    typedef TensorBase<Derived, ReadOnlyAccessors> Base;
     typedef internal::traits<Derived> DerivedTraits;
     typedef typename DerivedTraits::Scalar Scalar;
     typedef typename DerivedTraits::Index Index;
@@ -836,7 +981,8 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
+    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+    template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase;
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setZero() {
@@ -974,13 +1120,13 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorShufflingOp<const Shuffle, const Derived>
-    shuffle(const Shuffle& shuffle) const {
-      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
     }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorShufflingOp<const Shuffle, Derived>
-    shuffle(const Shuffle& shuffle) {
-      return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) {
+      return TensorShufflingOp<const Shuffle, Derived>(derived(), shfl);
     }
 
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -996,11 +1142,29 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     // Select the device on which to evaluate the expression.
     template <typename DeviceType>
-    TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
-      return TensorDevice<Derived, DeviceType>(device, derived());
+    TensorDevice<Derived, DeviceType> device(const DeviceType& dev) {
+      return TensorDevice<Derived, DeviceType>(dev, derived());
+    }
+
+    // Select the async device on which to evaluate the expression.
+    template <typename DeviceType, typename DoneCallback>
+    TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) {
+      return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
     }
 
  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase)
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase)
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Derived, const OtherDerived> Assign;
+      Assign assign(derived(), other.derived());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return derived();
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
     EIGEN_DEVICE_FUNC
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e55d12c42fc2ece4035b743915eef10d996ea0a
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -0,0 +1,1559 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+
+namespace Eigen {
+namespace internal {
+
+// -------------------------------------------------------------------------- //
+// Forward declarations for templates defined below.
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO;
+
+// -------------------------------------------------------------------------- //
+// Helper function to compute strides for densely stored buffer of given
+// dimensions.
+
+// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
+// this function instead everywhere.
+template <int Layout, typename IndexType, int NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+    const DSizes<IndexType, NumDims>& dimensions) {
+  DSizes<IndexType, NumDims> strides;
+  if (NumDims == 0) return strides;
+
+  // TODO(ezhulenev): Use templates to unroll this loop (similar to
+  // h_array_reduce in CXX11meta.h)? Benchmark it.
+  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * dimensions[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * dimensions[i + 1];
+    }
+  }
+
+  return strides;
+}
+
+template <int Layout, typename IndexType, size_t NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+    const Eigen::array<IndexType, NumDims>& dimensions) {
+  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
+}
+
+template <int Layout, std::ptrdiff_t... Indices>
+EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
+    const Sizes<Indices...>& sizes) {
+  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
+}
+
+// -------------------------------------------------------------------------- //
+
+// Tensor block shape type defines what are the shape preference for the blocks
+// extracted from the larger tensor.
+//
+// Example: blocks of 100 elements from the large 100x100 tensor:
+// - tensor: 100x100
+// - target_block_size: 100
+//
+// TensorBlockShapeType:
+//  - kUniformAllDims: 100 blocks of size 10x10
+//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
+//                      or row major layout)
+enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
+
+struct TensorBlockResourceRequirements {
+  TensorBlockShapeType shape_type;  // target block shape
+  size_t size;                      // target block size
+  TensorOpCost cost_per_coeff;      // cost of computing a single block element
+
+#ifdef EIGEN_HIPCC
+  // For HIPCC, we need to explicitly declare as a "device fun", the constructor
+  // which is implicitly invoked in the "merge" / "any" routines. else HIPCC
+  // errors out complaining about the lack of a matching constructor
+  EIGEN_DEVICE_FUNC
+  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
+				  TensorOpCost cost_)
+    : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
+  {}
+#endif
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes,
+      TensorOpCost cost) {
+    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
+    return {shape_type, size, cost};
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes) {
+    // This default cost per coefficient is valid for most materialized tensor
+    // block evaluation implementations, because they typically just read
+    // coefficients from the underlying tensor storage, and write to the tensor
+    // block buffer (scratch or destination memory, reads and writes have linear
+    // access pattern). We ignore the fixed cost of block evaluation, because in
+    // practice it should negligible.
+    //
+    // Lazy block evaluation adds the cost of calling a functor for each
+    // coefficient.
+    //
+    // All non-trivial block evaluation implementations must provide their own
+    // cost approximation (e.g. shuffling inner dimension has a much higher cost
+    // because it reads memory randomly, although the total number of moved
+    // bytes is the same).
+    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
+                                    {/*bytes_loaded=*/sizeof(Scalar),
+                                     /*bytes_stored=*/sizeof(Scalar),
+                                     /*compute_cycles=*/0});
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
+                                    size_in_bytes);
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
+                                    size_in_bytes);
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
+  merge(const TensorBlockResourceRequirements& lhs,
+        const TensorBlockResourceRequirements& rhs) {
+    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
+            merge(lhs.size, rhs.size),                       // size
+            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
+  }
+
+  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
+      TensorOpCost cost) {
+    cost_per_coeff += cost;
+    return *this;
+  }
+
+  // This is a resource requirement that should be returned from expressions
+  // that do not have any block evaluation preference (e.g. default tensor
+  // expression with raw buffer access).
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
+    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
+  }
+
+ private:
+  using Requirements = TensorBlockResourceRequirements;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
+    return numext::maxi(lhs_size, rhs_size);
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockShapeType
+  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
+    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
+            rhs == TensorBlockShapeType::kSkewedInnerDims)
+               ? TensorBlockShapeType::kSkewedInnerDims
+               : TensorBlockShapeType::kUniformAllDims;
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
+                                                TensorOpCost rhs_cost) {
+    return lhs_cost + rhs_cost;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
+
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  // If we evaluate a Tensor assignment, and expression on the left, already has
+  // a memory buffer, then we might do performance optimization, and evaluate
+  // the root expression directly into the final output memory. Some time it's
+  // possible to reuse it for materializing subexpressions inside an expression
+  // tree, to to avoid dynamic memory allocation.
+  //
+  // The pointer type of the underlying storage is erased, because passing
+  // Scalar type through all the expression evaluation layers is way too many
+  // templates. In practice destination buffer type should always match the
+  // evaluated expression scalar type.
+  class DestinationBuffer {
+   public:
+    enum DestinationBufferKind : int {
+      // The above explicit specification of "int" as the enum basetype is
+      // needed to get around a HIPCC link error ("the field type is not
+      // amp-compatible")
+      // which is issued for class members with the enum type.
+      // TODO(rocm):
+      // remove the "int" basetype once HIPCC has been fixed to not error out
+      // in the above scenario.
+
+      // Destination buffer is not defined (`m_data` == nullptr).
+      kEmpty,
+
+      // Tensor block defined by an owning tensor block descriptor can fit
+      // contiguously into the destination buffer. In this case it's safe to
+      // materialize tensor block in the destination buffer, wrap it in a
+      // TensorMap, and use to build Eigen expression on top of it.
+      kContiguous,
+
+      // Destination buffer strides do not match strides of the contiguously
+      // stored block, and it's impossible to define a TensorMap over this
+      // buffer. However if we are evaluating a root of an expression tree, we
+      // still can materialize an output into this destination, because we can
+      // guarantee that no one will ever access it through block API.
+      //
+      // In theory it is possible to build valid TensorStriding<TensorMap>
+      // expression on top of this destination buffer, however it has
+      // inefficient coeff/packet access, and defeats the purpose of fast block
+      // evaluation API.
+      kStrided
+    };
+
+    template <typename Scalar>
+    Scalar* data() const {
+      eigen_assert(m_data_type_size == sizeof(Scalar));
+      return static_cast<Scalar*>(m_data);
+    }
+
+    const Dimensions& strides() const { return m_strides; }
+    const DestinationBufferKind& kind() const { return m_kind; }
+
+   private:
+    friend class TensorBlockDescriptor;
+
+    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
+
+    template <typename Scalar>
+    DestinationBuffer(Scalar* data, const Dimensions& strides,
+                      DestinationBufferKind kind)
+        : m_data(static_cast<void*>(data)),
+          m_data_type_size(sizeof(Scalar)),
+          m_strides(strides),
+          m_kind(kind) {}
+
+    template <int Layout, typename Scalar>
+    static DestinationBuffer make(const TensorBlockDescriptor& desc,
+                                  Scalar* data, const Dimensions& strides) {
+      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
+    }
+
+    template <int Layout>
+    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
+                                      const Dimensions& strides) {
+      const Dimensions& desc_dims = desc.dimensions();
+      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
+      for (int i = 0; i < NumDims; ++i) {
+        if (desc_dims[i] == 1) continue;
+        if (desc_strides[i] != strides[i]) return kStrided;
+      }
+      return kContiguous;
+    }
+
+    // Storage pointer is type erased, to reduce template bloat, but we still
+    // keep the size of the underlying element type for error checking.
+    void* m_data;
+    size_t m_data_type_size;
+
+    // Destination buffer dimensions always match the dimensions of a tensor
+    // block descriptor it belongs to, however strides might be different.
+    Dimensions m_strides;
+
+    DestinationBufferKind m_kind;
+  };
+
+  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
+                        const DestinationBuffer& destination)
+      : m_offset(offset),
+        m_dimensions(dimensions),
+        m_destination(destination) {}
+
+  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
+      : m_offset(offset),
+        m_dimensions(dimensions),
+        m_destination(DestinationBuffer()) {}
+
+  IndexType offset() const { return m_offset; }
+  const Dimensions& dimensions() const { return m_dimensions; }
+  IndexType dimension(int index) const { return m_dimensions[index]; }
+  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
+
+  const DestinationBuffer& destination() const { return m_destination; }
+
+  template <int Layout, typename Scalar>
+  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
+    eigen_assert(dst_base != NULL);
+    m_destination =
+        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
+  }
+
+  template <int Layout, typename Scalar, typename DstStridesIndexType>
+  void AddDestinationBuffer(
+      Scalar* dst_base,
+      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
+    // DSizes constructor will do index type promotion if it's safe.
+    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
+  }
+
+  TensorBlockDescriptor& DropDestinationBuffer() {
+    m_destination.m_data = NULL;
+    m_destination.m_kind = DestinationBuffer::kEmpty;
+    return *this;
+  }
+
+  bool HasDestinationBuffer() const {
+    return m_destination.kind() != DestinationBuffer::kEmpty;
+  }
+
+  // Returns a copy of `*this` with updated offset.
+  TensorBlockDescriptor WithOffset(IndexType offset) const {
+    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
+  }
+
+ private:
+  // Offset and dimensions are immutable after construction. Block descriptor
+  // can only be mutated by adding or dropping destination.
+  const IndexType m_offset;
+  const Dimensions m_dimensions;
+  DestinationBuffer m_destination;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
+
+template <int NumDims, int Layout, typename IndexType = Eigen::Index>
+class TensorBlockMapper {
+  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
+
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  TensorBlockMapper() = default;
+  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
+                    const TensorBlockResourceRequirements& requirements)
+      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
+    // Compute block dimensions and the total number of blocks.
+    InitializeBlockDimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
+    return m_total_block_count;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
+    return m_block_dimensions.TotalSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
+  blockDimensions() const {
+    return m_block_dimensions;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
+  blockDescriptor(IndexType block_index) const {
+    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+    IndexType offset = 0;
+    DSizes<IndexType, NumDims> dimensions;
+
+    if (NumDims == 0) return BlockDescriptor(offset, dimensions);
+
+    // Iterate outer -> inner dimensions.
+    for (int i = NumDims - 1; i >= 0; --i) {
+      const int dim = isColMajor ? i : NumDims - i - 1;
+
+      const IndexType idx = block_index / m_block_strides[dim];
+      block_index -= idx * m_block_strides[dim];
+
+      const IndexType coord = idx * m_block_dimensions[dim];
+      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
+                                     m_block_dimensions[dim]);
+      offset += coord * m_tensor_strides[dim];
+    }
+
+    return {offset, dimensions};
+  }
+
+ private:
+  void InitializeBlockDimensions() {
+    // Requested block shape and size.
+    const TensorBlockShapeType shape_type = m_requirements.shape_type;
+    IndexType target_block_size =
+        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
+
+    IndexType tensor_size = m_tensor_dimensions.TotalSize();
+
+    // Corner case: one of the dimensions is zero. Logic below is too complex
+    // to handle this case on a general basis, just use unit block size.
+    // Note: we must not yield blocks with zero dimensions (recipe for
+    // overflows/underflows, divisions by zero and NaNs later).
+    if (tensor_size == 0) {
+      for (int i = 0; i < NumDims; ++i) {
+        m_block_dimensions[i] = 1;
+      }
+      m_total_block_count = 0;
+      return;
+    }
+
+    // If tensor fits into a target block size, evaluate it as a single block.
+    if (tensor_size <= target_block_size) {
+      m_block_dimensions = m_tensor_dimensions;
+      m_total_block_count = 1;
+      // The only valid block index is `0`, and in this case we do not need
+      // to compute real strides for tensor or blocks (see blockDescriptor).
+      for (int i = 0; i < NumDims; ++i) {
+        m_tensor_strides[i] = 0;
+        m_block_strides[i] = 1;
+      }
+      return;
+    }
+
+    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+    // Block shape skewed towards inner dimension.
+    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
+      IndexType coeff_to_allocate = target_block_size;
+
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+        m_block_dimensions[dim] =
+            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
+        coeff_to_allocate = divup(
+            coeff_to_allocate,
+            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
+      }
+      eigen_assert(coeff_to_allocate == 1);
+
+    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
+      // Tensor will not fit within 'target_block_size' budget: calculate tensor
+      // block dimension sizes based on "square" dimension size target.
+      const IndexType dim_size_target = convert_index<IndexType>(
+          std::pow(static_cast<float>(target_block_size),
+                   1.0f / static_cast<float>(m_block_dimensions.rank())));
+
+      for (int i = 0; i < NumDims; ++i) {
+        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
+        // a multiple of the packet size. Note that reducing
+        // 'block_dim_size' in this manner can increase the number of
+        // blocks, and so will amplify any per-block overhead.
+        m_block_dimensions[i] =
+            numext::mini(dim_size_target, m_tensor_dimensions[i]);
+      }
+
+      // Add any un-allocated coefficients to inner dimension(s).
+      IndexType total_size = m_block_dimensions.TotalSize();
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+
+        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
+          const IndexType total_size_other_dims =
+              total_size / m_block_dimensions[dim];
+          const IndexType alloc_avail =
+              divup<IndexType>(target_block_size, total_size_other_dims);
+          if (alloc_avail == m_block_dimensions[dim]) {
+            // Insufficient excess coefficients to allocate.
+            break;
+          }
+          m_block_dimensions[dim] =
+              numext::mini(m_tensor_dimensions[dim], alloc_avail);
+          total_size = total_size_other_dims * m_block_dimensions[dim];
+        }
+      }
+
+    } else {
+      eigen_assert(false);  // unknown block shape
+    }
+
+    eigen_assert(m_block_dimensions.TotalSize() >=
+                 numext::mini<IndexType>(target_block_size,
+                                         m_tensor_dimensions.TotalSize()));
+
+    // Calculate block counts by dimension and total block count.
+    DSizes<IndexType, NumDims> block_count;
+    for (int i = 0; i < NumDims; ++i) {
+      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
+    }
+    m_total_block_count = array_prod(block_count);
+
+    // Calculate block strides (used for enumerating blocks).
+    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
+    m_block_strides = strides<Layout>(block_count);
+  }
+
+  DSizes<IndexType, NumDims> m_tensor_dimensions;
+  TensorBlockResourceRequirements m_requirements;
+
+  DSizes<IndexType, NumDims> m_block_dimensions;
+  IndexType m_total_block_count;
+
+  DSizes<IndexType, NumDims> m_tensor_strides;
+  DSizes<IndexType, NumDims> m_block_strides;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockScratchAllocator is responsible for allocating temporary buffers
+// for block evaluation (output or input block materialization). Given that
+// Eigen expression traversal order is deterministic, all temporary allocations
+// are happening in the same order, and usually have exactly the same size.
+// Scratch allocator keeps a trace of all dynamic allocations, and after the
+// first block evaluation is completed, we should be able to reuse all the
+// temporary buffers for the next block evaluation.
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+  explicit TensorBlockScratchAllocator(const Device& device)
+      : m_device(device), m_allocation_index(0) {}
+
+  ~TensorBlockScratchAllocator() {
+    for (size_t i = 0; i < m_allocations.size(); ++i) {
+      m_device.deallocate(m_allocations[i].ptr);
+    }
+  }
+
+  void* allocate(size_t size) {
+    // TODO(ezhulenev): Remove when replaced with inlined vector.
+    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+    // Check if we already have an existing allocation att current index.
+    const int num_allocations = static_cast<int>(m_allocations.size());
+    const bool has_allocation = m_allocation_index < num_allocations;
+
+    // Allocation index can't be larger than the number of allocations.
+    eigen_assert(m_allocation_index <= num_allocations);
+
+    // If we have existing allocation, and its size is larger or equal to
+    // requested size, we do nothing.
+
+    // If current allocation can't fit requested size, we deallocate it, and
+    // replace with a larger allocation.
+    if (has_allocation && m_allocations[m_allocation_index].size < size) {
+      m_device.deallocate(m_allocations[m_allocation_index].ptr);
+      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+      m_allocations[m_allocation_index].size = size;
+    }
+
+    // Make a new allocation if we don't have and existing one.
+    if (!has_allocation) {
+      Allocation allocation;
+      allocation.ptr = m_device.allocate(size);
+      allocation.size = size;
+      m_allocations.push_back(allocation);
+    }
+
+    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+    eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+    return m_allocations[m_allocation_index++].ptr;
+  }
+
+  void reset() { m_allocation_index = 0; }
+
+ private:
+  struct Allocation {
+    void* ptr;
+    size_t size;
+  };
+
+  const Device& m_device;
+  int m_allocation_index;
+  // TODO(ezhulenev): This should be an inlined vector.
+  std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds, that can be produced by
+// TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+  // Tensor block that is a lazy expression that must be assigned to a
+  // destination using TensorBlockAssign.
+  kExpr,
+
+  // Tensor block that is a view into a memory buffer owned by an underlying
+  // Tensor expression (e.g. it can be a view into a Tensor buffer).
+  kView,
+
+  // Tensor block that was materialized in a scratch memory buffer, allocated
+  // with TensorBlockScratchAllocator. This block must be copied to a
+  // destination, similar to a block of `kExpr` type.
+  kMaterializedInScratch,
+
+  // Tensor block that was materialized directly into the final output memory
+  // buffer. For example if the left side of an assignment is a Tensor, we can
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
+  kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to defined TensorBlock typedef in
+// TensorEvaluators that do not support block evaluation.
+
+class TensorBlockNotImplemented {
+ public:
+  typedef void XprType;
+};
+
+// -------------------------------------------------------------------------- //
+// XprScalar extracts Scalar type from the Eigen expressions (if expression type
+// is not void). It's required to be able to define lazy block expression for
+// argument types, that do not support block evaluation.
+
+template <typename XprType>
+struct XprScalar {
+  typedef typename XprType::Scalar type;
+};
+template <>
+struct XprScalar<void> {
+  typedef void type;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorMaterializedBlock is a fully evaluated block of the original tensor,
+// and XprType is just a TensorMap over the data. This block type is typically
+// used to materialize blocks of tensor expressions, that can't be efficiently
+// represented as lazy Tensor expressions with fast coeff/packet operations,
+// e.g. we materialize all broadcasts into evaluated blocks.
+//
+// TensorMaterializedBlock does not own its memory buffer, it's either a memory
+// buffer that backs the original expression (e.g. block is just a view into a
+// Tensor), or a memory buffer allocated with scratch allocator, and in this
+// case the scratch allocator will deallocate it at the end of block based
+// expression execution.
+//
+// If the block was evaluated directly into the output buffer, and strides in
+// the output buffer do not match block strides, the TensorMap expression will
+// be invalid, and should never be used in block assignment or any other tensor
+// expression.
+
+template <typename Scalar, int NumDims, int Layout,
+          typename IndexType = Eigen::Index>
+class TensorMaterializedBlock {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
+
+  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
+                          const Dimensions& dimensions, bool valid_expr = true)
+      : m_kind(kind),
+        m_data(data),
+        m_dimensions(dimensions),
+        m_expr(m_data, m_dimensions),
+        m_valid_expr(valid_expr) {
+    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
+                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
+                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
+  }
+
+  TensorBlockKind kind() const { return m_kind; }
+  // NOTE(ezhulenev): Returning XprType by value like in other block types
+  // causes asan failures. The theory is that XprType::Nested doesn't work
+  // properly for TensorMap.
+  const XprType& expr() const {
+    eigen_assert(m_valid_expr);
+    return m_expr;
+  }
+  const Scalar* data() const { return m_data; }
+  void cleanup() {}
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+
+  // TensorMaterializedBlock can be backed by different types of storage:
+  //
+  //   (1) Contiguous block of memory allocated with scratch allocator.
+  //   (2) Contiguous block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //   (3) Strided block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //
+  class Storage {
+   public:
+    Scalar* data() const { return m_data; }
+    const Dimensions& dimensions() const { return m_dimensions; }
+    const Dimensions& strides() const { return m_strides; }
+
+    TensorMaterializedBlock AsTensorMaterializedBlock() const {
+      return TensorMaterializedBlock(
+          m_materialized_in_output
+              ? internal::TensorBlockKind::kMaterializedInOutput
+              : internal::TensorBlockKind::kMaterializedInScratch,
+          m_data, m_dimensions, !m_strided_storage);
+    }
+
+   private:
+    friend class TensorMaterializedBlock;
+
+    Storage(Scalar* data, const Dimensions& dimensions,
+            const Dimensions& strides, bool materialized_in_output,
+            bool strided_storage)
+        : m_data(data),
+          m_dimensions(dimensions),
+          m_strides(strides),
+          m_materialized_in_output(materialized_in_output),
+          m_strided_storage(strided_storage) {}
+
+    Scalar* m_data;
+    Dimensions m_dimensions;
+    Dimensions m_strides;
+    bool m_materialized_in_output;
+    bool m_strided_storage;
+  };
+
+  // Creates a storage for materialized block either from the block descriptor
+  // destination buffer, or allocates a new buffer with scratch allocator.
+  template <typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static Storage prepareStorage(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch,
+      bool allow_strided_storage = false) {
+    // Try to reuse destination as an output block buffer.
+    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/true,
+                     /*strided_storage=*/false);
+
+    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+               allow_strided_storage) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+                     /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/false,
+                     /*strided_storage=*/false);
+    }
+  }
+
+  // Creates a materialized block for the given descriptor from a memory buffer.
+  template <typename DataDimensions, typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+      const Scalar* data, const DataDimensions& data_dims,
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+    // If a tensor block dimensions covers a contiguous block of the underlying
+    // memory, we can skip block buffer memory allocation, and construct a block
+    // from existing `data` memory buffer.
+    //
+    // Example: (RowMajor layout)
+    //   data_dims:          [11, 12, 13, 14]
+    //   desc.dimensions():  [1,   1,  3, 14]
+    //
+    // In this case we can construct a TensorBlock starting at
+    // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Find out how many inner dimensions have a matching size.
+    int num_matching_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (data_dims[dim] != desc.dimensions()[dim]) break;
+      ++num_matching_inner_dims;
+    }
+
+    // All the outer dimensions must be of size `1`, except a single dimension
+    // before the matching inner dimension (`3` in the example above).
+    bool can_use_direct_access = true;
+    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+                                     block_start, desc.dimensions());
+
+    } else {
+      // Reuse destination buffer or allocate new buffer with scratch allocator.
+      const Storage storage = prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
+          TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
+                           data, desc.offset());
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+                           storage.data());
+
+      TensorBlockIO::Copy(dst, src);
+      return storage.AsTensorMaterializedBlock();
+    }
+  }
+
+ private:
+  TensorBlockKind m_kind;
+  const Scalar* m_data;
+  Dimensions m_dimensions;
+  XprType m_expr;
+  bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
+      type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+      : m_arg_block(arg_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename LhsTensorBlock::XprType>::value ||
+      internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
+                          const typename RhsTensorBlock::XprType> >::type
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
+                         const RhsTensorBlock& right_block,
+                         const BinaryOp& functor)
+      : m_left_block(left_block),
+        m_right_block(right_block),
+        m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const {
+    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
+  }
+
+  const Scalar* data() const { return NULL; }
+
+  void cleanup() {
+    m_left_block.cleanup();
+    m_right_block.cleanup();
+  }
+
+ private:
+  LhsTensorBlock m_left_block;
+  RhsTensorBlock m_right_block;
+  BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+
+template <typename BlockFactory, typename ArgTensorBlock>
+class TensorUnaryExprBlock {
+  typedef typename ArgTensorBlock::XprType ArgXprType;
+  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
+                       const BlockFactory& factory)
+      : m_arg_block(arg_block), m_factory(factory) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorTernaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from three blocks of the underlying type.
+
+template <typename BlockFactory, typename Arg1TensorBlock,
+          typename Arg2TensorBlock, typename Arg3TensorBlock>
+class TensorTernaryExprBlock {
+  typedef typename Arg1TensorBlock::XprType Arg1XprType;
+  typedef typename Arg2TensorBlock::XprType Arg2XprType;
+  typedef typename Arg3TensorBlock::XprType Arg3XprType;
+
+  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
+                                       internal::is_void<Arg2XprType>::value ||
+                                       internal::is_void<Arg3XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
+                                              Arg3XprType>::type>::type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
+                         const Arg2TensorBlock& arg2_block,
+                         const Arg3TensorBlock& arg3_block,
+                         const BlockFactory& factory)
+      : m_arg1_block(arg1_block),
+        m_arg2_block(arg2_block),
+        m_arg3_block(arg3_block),
+        m_factory(factory) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+  XprType expr() const {
+    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
+                          m_arg3_block.expr());
+  }
+  const Scalar* data() const { return NULL; }
+  void cleanup() {
+    m_arg1_block.cleanup();
+    m_arg2_block.cleanup();
+    m_arg3_block.cleanup();
+  }
+
+ private:
+  Arg1TensorBlock m_arg1_block;
+  Arg2TensorBlock m_arg2_block;
+  Arg3TensorBlock m_arg3_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// StridedLinearBufferCopy provides a method to copy data between two linear
+// buffers with different strides, with optimized paths for scatter/gather.
+
+template <typename Scalar, typename IndexType>
+class StridedLinearBufferCopy {
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size
+  };
+
+ public:
+  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
+  enum class Kind {
+    Linear = 0,       // src_stride == 1 && dst_stride == 1
+    Scatter = 1,      // src_stride == 1 && dst_stride != 1
+    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
+    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
+    Gather = 4,       // dst_stride == 1
+    Random = 5        // everything else
+  };
+
+  struct Dst {
+    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
+
+    IndexType offset;
+    IndexType stride;
+    Scalar* data;
+  };
+
+  struct Src {
+    Src(IndexType o, IndexType s, const Scalar* d)
+        : offset(o), stride(s), data(d) {}
+
+    IndexType offset;
+    IndexType stride;
+    const Scalar* data;
+  };
+
+  template <typename StridedLinearBufferCopy::Kind kind>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
+                                                        const Src& src,
+                                                        const size_t count) {
+    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
+              src.data);
+  }
+
+ private:
+  template <typename StridedLinearBufferCopy::Kind kind>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const IndexType count, const IndexType dst_offset,
+      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+      const IndexType src_offset, const IndexType src_stride,
+      const Scalar* EIGEN_RESTRICT src_data) {
+    const Scalar* src = &src_data[src_offset];
+    Scalar* dst = &dst_data[dst_offset];
+
+    if (!Vectorizable) {
+      for (Index i = 0; i < count; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+      return;
+    }
+
+    const IndexType vectorized_size = count - PacketSize;
+    IndexType i = 0;
+
+    if (kind == StridedLinearBufferCopy::Kind::Linear) {
+      // ******************************************************************** //
+      // Linear copy from `src` to `dst`.
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      eigen_assert(src_stride == 1 && dst_stride == 1);
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          Packet p = ploadu<Packet>(src + i + j * PacketSize);
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = ploadu<Packet>(src + i);
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = src[i];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
+      // Scatter from `src` to `dst`.
+      eigen_assert(src_stride == 1 && dst_stride != 1);
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = ploadu<Packet>(src + i);
+        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+      }
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = src[i];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
+      // Fill `dst` with value at `*src`.
+      eigen_assert(src_stride == 0 && dst_stride == 1);
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      Packet p = pload1<Packet>(src);
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
+      for (; i <= vectorized_size; i += PacketSize) {
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = *src;
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
+      // Scatter `*src` into `dst`.
+      eigen_assert(src_stride == 0 && dst_stride != 1);
+      Packet p = pload1<Packet>(src);
+      for (; i <= vectorized_size; i += PacketSize) {
+        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+      }
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = *src;
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
+      // Gather from `src` into `dst`.
+      eigen_assert(dst_stride == 1);
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = src[i * src_stride];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
+      // Random.
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+    } else {
+      eigen_assert(false);
+    }
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.
+// It's possible to specify src->dst dimension mapping for the copy operation.
+// Dimensions of `dst` specify how many elements have to be copied, for the
+// `src` we need to know only stride to navigate through source memory buffer.
+
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO {
+  static const bool IsColMajor = (Layout == ColMajor);
+
+  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
+
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+  typedef DSizes<int, NumDims> DimensionsMap;
+
+  struct Dst {
+    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
+        IndexType dst_offset = 0)
+        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  struct Src {
+    Src(const Dimensions& src_strides, const Scalar* src,
+        IndexType src_offset = 0)
+        : strides(src_strides), data(src), offset(src_offset) {}
+
+    Dimensions strides;
+    const Scalar* data;
+    IndexType offset;
+  };
+
+  // Copies data to `dst` from `src`, using provided dimensions mapping:
+  //
+  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
+  //
+  // Returns the number of copied elements.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
+      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
+    // Copy single scalar value from `src` to `dst`.
+    if (NumDims == 0) {
+      *(dst.data + dst.offset) = *(src.data + src.offset);
+      return 1;
+    }
+
+    // Both `dst` and `src` must have contiguous innermost dimension. We also
+    // accept the special case with stride '0', because it's used as a trick to
+    // implement broadcasting.
+    {
+      int inner_dim = IsColMajor ? 0 : NumDims - 1;
+      EIGEN_UNUSED_VARIABLE(inner_dim);
+      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
+      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
+    }
+
+    // Give a shorter name to `dst_to_src_dim_map`.
+    const DimensionsMap& dim_map = dst_to_src_dim_map;
+
+    // Do not squeeze reordered inner dimensions.
+    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
+
+    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
+    // block, and we write data linearly into that dimension, reading it from
+    // the src. If dimensions are reordered, we might end up reading data from
+    // the src with `stride != 1`.
+    //
+    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
+    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
+
+    // Find the innermost dimension in the dst whose size is not 1. This is the
+    // effective inner dim.
+    int num_size_one_inner_dims = 0;
+    for (int i = 0; i < num_squeezable_dims; ++i) {
+      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+      if (dst.dims[dst_dim] != 1) break;
+      num_size_one_inner_dims++;
+    }
+
+    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
+    if (num_size_one_inner_dims == NumDims) {
+      *(dst.data + dst.offset) = *(src.data + src.offset);
+      return 1;
+    }
+
+    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
+    const int dst_stride1_dim = IsColMajor
+                                    ? num_size_one_inner_dims
+                                    : NumDims - num_size_one_inner_dims - 1;
+
+    // Dimension in the src that corresponds to the dst innermost dimension.
+    const int src_dim_for_dst_stride1_dim =
+        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
+
+    // Size of the innermost dimension (length of contiguous blocks of memory).
+    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
+
+    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
+    // `src` memory, so we can do less linear copy calls.
+    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
+      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+      const IndexType dst_stride = dst.strides[dst_dim];
+      const IndexType src_stride = src.strides[dim_map[dst_dim]];
+      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
+        dst_inner_dim_size *= dst.dims[dst_dim];
+        ++num_size_one_inner_dims;
+      } else {
+        break;
+      }
+    }
+
+    // Setup strides to read data from `src` and write to `dst`.
+    IndexType input_offset = src.offset;
+    IndexType output_offset = dst.offset;
+    IndexType input_stride =
+        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
+    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
+
+    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+    array<BlockIteratorState, at_least_1_dim> it;
+
+    // Initialize block iterator state. Squeeze away any dimension of size 1.
+    int idx = 0;  // currently initialized iterator state index
+    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      if (dst.dims[dst_dim] == 1) continue;
+
+      it[idx].size = dst.dims[dst_dim];
+      it[idx].input_stride = src.strides[dim_map[dst_dim]];
+      it[idx].output_stride = dst.strides[dst_dim];
+
+      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+
+      idx++;
+    }
+
+    // Iterate copying data from src to dst.
+    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
+
+#define COPY_INNER_DIM(KIND)                                           \
+  IndexType num_copied = 0;                                            \
+  for (num_copied = 0; num_copied < block_total_size;                  \
+       num_copied += dst_inner_dim_size) {                             \
+    LinCopy::template Run<KIND>(                                       \
+        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
+        typename LinCopy::Src(input_offset, input_stride, src.data),   \
+        dst_inner_dim_size);                                           \
+                                                                       \
+    for (int j = 0; j < idx; ++j) {                                    \
+      if (++it[j].count < it[j].size) {                                \
+        input_offset += it[j].input_stride;                            \
+        output_offset += it[j].output_stride;                          \
+        break;                                                         \
+      }                                                                \
+      it[j].count = 0;                                                 \
+      input_offset -= it[j].input_span;                                \
+      output_offset -= it[j].output_span;                              \
+    }                                                                  \
+  }                                                                    \
+  return num_copied;
+
+    if (input_stride == 1 && output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Linear);
+    } else if (input_stride == 1 && output_stride != 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Scatter);
+    } else if (input_stride == 0 && output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
+    } else if (input_stride == 0 && output_stride != 1) {
+      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
+    } else if (output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Gather);
+    } else {
+      COPY_INNER_DIM(LinCopy::Kind::Random);
+    }
+
+#undef COPY_INNER_DIM
+  }
+
+  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
+  // the number of copied elements.
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
+                                                              const Src& src) {
+    DimensionsMap dst_to_src_map;
+    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
+    return Copy(dst, src, dst_to_src_map);
+  }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : size(0),
+          count(0),
+          input_stride(0),
+          output_stride(0),
+          input_span(0),
+          output_span(0) {}
+
+    IndexType size;
+    IndexType count;
+    IndexType input_stride;
+    IndexType output_stride;
+    IndexType input_span;
+    IndexType output_span;
+  };
+
+  // Compute how many inner dimensions it's allowed to squeeze when doing IO
+  // between two tensor blocks. It's safe to squeeze inner dimensions, only
+  // if they are not reordered.
+  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
+    int num_squeezable_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      if (dim_map[dim] != dim) break;
+      num_squeezable_dims++;
+    }
+    return num_squeezable_dims;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
+//
+// Currently there is no way to write from a Tensor expression to a block of
+// memory, if dimensions are reordered. If you need to do that, you should
+// materialize a Tensor block expression into a memory buffer, and then use
+// TensorBlockIO to copy data between two memory buffers with a custom
+// `target->src` dimension map (see definition above).
+//
+// Also currently the innermost dimension of `target` must have a stride '1'
+// (contiguous in memory). This restriction could be lifted with a `pscatter`,
+// but in practice it's never needed, and there is a similar TensorBlockIO
+// workaround for that.
+//
+// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
+// where `src` is a tensor expression. Explore if it is possible to rewrite IO
+// to use expressions instead of pointers, and after that TensorBlockAssignment
+// will become an alias to IO.
+template <typename Scalar, int NumDims, typename TensorBlockExpr,
+          typename IndexType = Eigen::Index>
+class TensorBlockAssignment {
+  // We will use coeff/packet path to evaluate block expressions.
+  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
+      TensorBlockEvaluator;
+
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size
+  };
+
+  template <bool Vectorizable, typename Evaluator>
+  struct InnerDimAssign {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      for (IndexType i = 0; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+  template <typename Evaluator>
+  struct InnerDimAssign<true, Evaluator> {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      typedef typename packet_traits<Scalar>::type Packet;
+
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      const IndexType vectorized_size = count - PacketSize;
+      IndexType i = 0;
+
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          const IndexType idx = eval_offset + i + j * PacketSize;
+          Packet p = eval.template packet<Unaligned>(idx);
+          pstoreu<Scalar>(target + i + j * PacketSize, p);
+        }
+      }
+
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = eval.template packet<Unaligned>(eval_offset + i);
+        pstoreu<Scalar>(target + i, p);
+      }
+
+      for (; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+ public:
+  struct Target {
+    Target(const Dimensions& target_dims, const Dimensions& target_strides,
+           Scalar* target_data, IndexType target_offset = 0)
+        : dims(target_dims),
+          strides(target_strides),
+          data(target_data),
+          offset(target_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  static Target target(const Dimensions& target_dims,
+                       const Dimensions& target_strides, Scalar* target_data,
+                       IndexType target_offset = 0) {
+    return Target(target_dims, target_strides, target_data, target_offset);
+  }
+
+  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+  static Target target(
+      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+      Scalar* target_data, IndexType target_offset = 0) {
+    // DSizes constructor will do index type promotion if it's safe.
+    return Target(Dimensions(target_dims), Dimensions(target_strides),
+                  target_data, target_offset);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Target& target, const TensorBlockExpr& expr) {
+    // Prepare evaluator for block expression.
+    DefaultDevice default_device;
+    TensorBlockEvaluator eval(expr, default_device);
+
+    // Tensor block expression dimension should match destination dimensions.
+    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
+
+    static const int Layout = TensorBlockEvaluator::Layout;
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Initialize output inner dimension size based on a layout.
+    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
+    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
+    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
+
+    // Target inner dimension stride must be '1'.
+    eigen_assert(target.strides[inner_dim_idx] == 1);
+
+    // Squeeze multiple inner dims into one if they are contiguous in `target`.
+    IndexType num_squeezed_dims = 0;
+    for (Index i = 1; i < NumDims; ++i) {
+      const Index dim = is_col_major ? i : NumDims - i - 1;
+      const IndexType target_stride = target.strides[dim];
+
+      if (output_inner_dim_size == target_stride) {
+        output_inner_dim_size *= target.dims[dim];
+        num_squeezed_dims++;
+      } else {
+        break;
+      }
+    }
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+
+    int idx = 0;  // currently initialized iterator state index
+    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
+      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
+
+      it[idx].count = 0;
+      it[idx].size = target.dims[dim];
+      it[idx].output_stride = target.strides[dim];
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+      idx++;
+    }
+
+    // We read block expression from the beginning, and start writing data to
+    // `target` at given offset.
+    IndexType input_offset = 0;
+    IndexType output_offset = target.offset;
+
+    // Iterate copying data from `eval` to `target`.
+    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
+      // Assign to `target` at current offset.
+      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
+                     TensorBlockEvaluator>::Run(target.data + output_offset,
+                                                output_inner_dim_size, eval,
+                                                input_offset);
+
+      // Move input offset forward by the number of assigned coefficients.
+      input_offset += output_inner_dim_size;
+
+      // Update index.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+  }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0), size(0), output_stride(0), output_span(0) {}
+
+    IndexType count;
+    IndexType size;
+    IndexType output_stride;
+    IndexType output_span;
+  };
+};
+
+// -------------------------------------------------------------------------- //
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 4cfe300eb428f0b25f5fd77b774aa47ba19ecf2f..a354132f607995f99dbab96a50db0c0bb5085ee7 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -31,12 +31,13 @@ struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Broadcast, typename XprType>
 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
 {
-  typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
+  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
 };
 
 template<typename Broadcast, typename XprType>
@@ -54,7 +55,7 @@ struct is_input_scalar<Sizes<> > {
   static const bool value = true;
 };
 #ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::size_t... Indices>
+template <typename std::ptrdiff_t... Indices>
 struct is_input_scalar<Sizes<Indices...> > {
   static const bool value = (Sizes<Indices...>::total_size == 1);
 };
@@ -103,27 +104,57 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  protected: //  all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout;
+  bool isCopy, nByOne, oneByN;
+  public:
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-    : m_broadcast(op.broadcast()),m_impl(op.expression(), device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  // We do block based broadcasting using a trick with 2x tensor rank and 0
+  // strides. See block method implementation for details.
+  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : isCopy(false), nByOne(false), oneByN(false),
+        m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
   {
+
     // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
     // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
     // tensor with N >= 1 of 1 element first and then broadcast.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const InputDimensions& input_dims = m_impl.dimensions();
-    const Broadcast& broadcast = op.broadcast();
+    isCopy = true;
     for (int i = 0; i < NumDims; ++i) {
       eigen_assert(input_dims[i] > 0);
-      m_dimensions[i] = input_dims[i] * broadcast[i];
+      m_dimensions[i] = input_dims[i] * m_broadcast[i];
+      if (m_broadcast[i] != 1) {
+        isCopy = false;
+      }
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -141,16 +172,58 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
       }
     }
+
+    if (input_dims[0] == 1) {
+      oneByN = true;
+      for (int i = 1; i < NumDims; ++i) {
+        if (m_broadcast[i] != 1) {
+          oneByN = false;
+          break;
+        }
+      }
+    } else if (input_dims[NumDims-1] == 1) {
+      nByOne = true;
+      for (int i = 0; i < NumDims-1; ++i) {
+        if (m_broadcast[i] != 1) {
+          nByOne = false;
+          break;
+        }
+      }
+    }
+
+    // Handle special format like NCHW, its input shape is '[1, N..., 1]' and
+    // broadcast shape is '[N, 1..., N]'
+    if (!oneByN && !nByOne) {
+      if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
+        nByOne = true;
+        oneByN = true;
+        for (int i = 1; i < NumDims-1; ++i) {
+          if (m_broadcast[i] != 1) {
+            nByOne = false;
+            oneByN = false;
+            break;
+          }
+        }
+      }
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -161,16 +234,24 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      return coeffColMajor(index);
+      if (isCopy) {
+        return m_impl.coeff(index);
+      } else {
+        return coeffColMajor(index);
+      }
     } else {
-      return coeffRowMajor(index);
+      if (isCopy) {
+        return m_impl.coeff(index);
+      } else {
+        return coeffRowMajor(index);
+      }
     }
   }
 
   // TODO: attempt to speed this up. The integer divisions and modulo are slow
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -195,12 +276,17 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         inputIndex += (index % m_impl.dimensions()[0]);
       }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
   {
+    return m_impl.coeff(indexColMajor(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -215,17 +301,22 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       }
       index -= idx * m_outputStrides[i];
     }
-    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
-      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
       inputIndex += index;
     } else {
-      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
-        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
       } else {
-        inputIndex += (index % m_impl.dimensions()[NumDims-1]);
+        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
       }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  {
+    return m_impl.coeff(indexRowMajor(index));
   }
 
   template<int LoadMode>
@@ -236,9 +327,148 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      return packetColMajor<LoadMode>(index);
+      if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+        // unaligned loads here. The reason is unclear though.
+        return m_impl.template packet<Unaligned>(index);
+        #else
+        return m_impl.template packet<LoadMode>(index);
+        #endif
+      } else if (oneByN && !nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else if (!oneByN && nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else if (oneByN && nByOne) {
+        return packetOneByNByOne<LoadMode>(index);
+      } else {
+        return packetColMajor<LoadMode>(index);
+      }
     } else {
-      return packetRowMajor<LoadMode>(index);
+      if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See above.
+        return m_impl.template packet<Unaligned>(index);
+        #else
+        return m_impl.template packet<LoadMode>(index);
+        #endif
+      } else if (oneByN && !nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else if (!oneByN && nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else if (oneByN && nByOne) {
+        return packetOneByNByOne<LoadMode>(index);
+      } else {
+        return packetRowMajor<LoadMode>(index);
+      }
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
+  (Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    Index startDim, endDim;
+    Index inputIndex, outputOffset, batchedIndex;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      startDim = NumDims - 1;
+      endDim = 1;
+    } else {
+      startDim = 0;
+      endDim = NumDims - 2;
+    }
+
+    batchedIndex = index % m_outputStrides[startDim];
+    inputIndex   = batchedIndex / m_outputStrides[endDim];
+    outputOffset = batchedIndex % m_outputStrides[endDim];
+
+    if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
+      values[0] = m_impl.coeff(inputIndex);
+      return internal::pload1<PacketReturnType>(values);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+        if (outputOffset + cur < m_outputStrides[endDim]) {
+          values[i] = m_impl.coeff(inputIndex);
+        } else {
+          ++inputIndex;
+          inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
+          values[i] = m_impl.coeff(inputIndex);
+          outputOffset = 0;
+          cur = 0;
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    Index dim, inputIndex;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      dim = NumDims - 1;
+    } else {
+      dim = 0;
+    }
+
+    inputIndex = index % m_inputStrides[dim];
+    if (inputIndex + PacketSize <= m_inputStrides[dim]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        if (inputIndex > m_inputStrides[dim]-1) {
+          inputIndex = 0;
+        }
+        values[i] = m_impl.coeff(inputIndex++);
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    Index dim, inputIndex, outputOffset;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      dim = 1;
+    } else {
+      dim = NumDims - 2;
+    }
+
+    inputIndex   = index / m_outputStrides[dim];
+    outputOffset = index % m_outputStrides[dim];
+    if (outputOffset + PacketSize <= m_outputStrides[dim]) {
+      values[0] = m_impl.coeff(inputIndex);
+      return internal::pload1<PacketReturnType>(values);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+        if (outputOffset + cur < m_outputStrides[dim]) {
+          values[i] = m_impl.coeff(inputIndex);
+        } else {
+          values[i] = m_impl.coeff(++inputIndex);
+          outputOffset = 0;
+          cur = 0;
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
     }
   }
 
@@ -253,6 +483,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     const Index originalIndex = index;
 
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -288,8 +519,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
-        values[i] = coeffColMajor(originalIndex+i);
+        if (innermostLoc + i < m_impl.dimensions()[0]) {
+          values[i] = m_impl.coeff(inputIndex+i);
+        } else {
+          values[i] = coeffColMajor(originalIndex+i);
+        }
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -305,6 +541,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     const Index originalIndex = index;
 
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -340,8 +577,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
-        values[i] = coeffRowMajor(originalIndex+i);
+        if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
+          values[i] = m_impl.coeff(inputIndex+i);
+        } else {
+          values[i] = coeffRowMajor(originalIndex+i);
+        }
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -351,7 +593,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     double compute_cost = TensorOpCost::AddCost<Index>();
-    if (NumDims > 0) {
+    if (!isCopy && NumDims > 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         compute_cost += TensorOpCost::DivCost<Index>();
         if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -372,14 +615,472 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
+    // tensors. But this might need further tuning.
+    const size_t target_size = m_device.firstLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        m_impl.getResourceRequirements(),
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    BlockBroadcastingParams params = blockBroadcastingParams(desc);
+
+    if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
+      return emptyBlock();
+    }
+
+    // Prepare storage for the materialized broadcasting result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+    ScalarNoConst* materialized_output = block_storage.data();
+
+    // We potentially will need to materialize input blocks.
+    size_t materialized_input_size = 0;
+    ScalarNoConst* materialized_input = NULL;
+
+    // Initialize block broadcating iterator state for outer dimensions (outer
+    // with regard to bcast dimension). Dimension in this array are always in
+    // inner_most -> outer_most order (col major layout).
+    array<BlockBroadcastingIteratorState, NumDims> it;
+    int idx = 0;
+
+    for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
+      const Index dim = IsColMajor ? i : NumDims - 1 - i;
+      it[idx].size = params.output_dims[dim];
+      it[idx].count = 0;
+      it[idx].output_stride = m_outputStrides[dim];
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+      idx++;
+    }
+
+    // Write output into the beginning of `materialized_output`.
+    Index output_offset = 0;
+
+    // We will fill output block by broadcasting along the bcast dim, and
+    // iterating over outer dimension.
+    const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
+
+    for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
+      ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
+      Index bcast_offset = desc.offset() + output_offset;
+
+      // Broadcast along the bcast dimension.
+      num_output_coeffs += BroadcastBlockAlongBcastDim(
+          params, bcast_offset, scratch, bcast_output, &materialized_input,
+          &materialized_input_size);
+
+      // Switch to the next outer dimension.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
   Broadcast functor() const { return m_broadcast; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
+      cl::sycl::handler& cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+ private:
+  static const bool IsColMajor =
+      static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+  // We will build a general case block broadcasting on top of broadcasting
+  // primitive that will do broadcasting only for the inner dimension(s) along
+  // the first dimension smaller than the input size (it's called `bcast_dim`).
+  //
+  // Example:
+  //           dim:  0  1  2   (ColMajor)
+  //    input size: [9, 3, 6]
+  //    block size: [9, 2, 6]
+  //
+  // We will compute broadcasted block by iterating over the outer dimensions
+  // before `bcast_dim` (only dimension `2` in this example) and computing
+  // broadcasts along the `bcast_dim` (dimension `1` in this example).
+
+  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
+  // single block along the broadcasting dimension. Sizes and strides along the
+  // `bcast_dim` might be invalid, they will be adjusted later in
+  // `BroadcastBlockAlongBcastDim`.
+  struct BlockBroadcastingParams {
+    Dimensions input_dims;      // input expression dimensions
+    Dimensions output_dims;     // output block sizes
+    Dimensions output_strides;  // output block strides
+
+    int inner_dim_count;   // count inner dimensions matching in size
+    int bcast_dim;         // broadcasting dimension index
+    Index bcast_dim_size;  // broadcasting dimension size
+    Index inner_dim_size;  // inner dimensions size
+
+    // Block sizes and strides for the input block where all dimensions before
+    // `bcast_dim` are equal to `1`.
+    Dimensions input_block_sizes;
+    Dimensions input_block_strides;
+
+    // Block sizes and strides for blocks with extra dimensions and strides `0`.
+    BroadcastDimensions bcast_block_sizes;
+    BroadcastDimensions bcast_block_strides;
+    BroadcastDimensions bcast_input_strides;
+  };
+
+  struct BlockBroadcastingIteratorState {
+    Index size;
+    Index count;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
+  blockBroadcastingParams(TensorBlockDesc& desc) const {
+    BlockBroadcastingParams params;
+
+    params.input_dims = Dimensions(m_impl.dimensions());
+
+    // Output block sizes and strides.
+    params.output_dims = desc.dimensions();
+    params.output_strides = internal::strides<Layout>(params.output_dims);
+
+    // Find the broadcasting dimension (first dimension with output size smaller
+    // that the input size).
+    params.bcast_dim = 0;
+    params.bcast_dim_size = 1;
+    params.inner_dim_size = 1;
+
+    // Count the number of inner dimensions that have the same size in the block
+    // and in the broadcast expression.
+    params.inner_dim_count = 0;
+
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      if (params.output_dims[dim] == m_dimensions[dim]) {
+        params.inner_dim_size *= params.output_dims[dim];
+        ++params.inner_dim_count;
+        continue;
+      }
+
+      // First non-matching dimension is the broadcasting dimension.
+      eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
+      params.bcast_dim = dim;
+      params.bcast_dim_size = params.output_dims[dim];
+      break;
+    }
+
+    // Calculate the input block size for looking into the input.
+    for (int i = 0; i < params.inner_dim_count; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      params.input_block_sizes[dim] = params.input_dims[dim];
+    }
+    for (int i = params.inner_dim_count; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      params.input_block_sizes[dim] = 1;
+    }
+    params.input_block_strides =
+        internal::strides<Layout>(params.input_block_sizes);
+
+    // Broadcast with the 0-stride trick: Create 1 extra dim for each
+    // broadcast, set the input stride to 0.
+    //
+    // When ColMajor:
+    //
+    // - bcast_block_sizes:
+    //   [d_0, b_0, d_1, b_1, ...]
+    //
+    // - bcast_block_strides:
+    //   [output_block_strides[0], output_block_strides[0] * d_0,
+    //    output_block_strides[1], output_block_strides[1] * d_1,
+    //   ...]
+    //
+    // - bcast_input_strides:
+    //   [input_block_strides[0], 0,
+    //    input_block_strides[1], 0,
+    //   ...].
+    //
+    for (int i = 0; i < params.inner_dim_count; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
+      const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
+
+      params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
+      params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+      params.bcast_block_strides[copy_dim] = params.output_strides[dim];
+      params.bcast_block_strides[broadcast_dim] =
+          params.output_strides[dim] * params.input_dims[dim];
+      params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
+      params.bcast_input_strides[broadcast_dim] = 0;
+    }
+
+    for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
+      const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
+      params.bcast_block_sizes[dim] = 1;
+      params.bcast_block_strides[dim] = 0;
+      params.bcast_input_strides[dim] = 0;
+    }
+
+    return params;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
+    DSizes<Index, NumDims> dimensions;
+    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
+    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
+      BlockBroadcastingParams params, Index bcast_offset,
+      TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
+      ScalarNoConst** materialized_input,
+      size_t* materialized_input_size) const {
+    if (params.bcast_dim_size == 1) {
+      // We just need one block read using the ready-set values above.
+      return BroadcastBlock(
+          params.input_block_sizes, params.input_block_strides,
+          params.bcast_block_sizes, params.bcast_block_strides,
+          params.bcast_input_strides, bcast_offset, 0, scratch,
+          materialized_output, materialized_input, materialized_input_size);
+
+    } else if (params.input_dims[params.bcast_dim] == 1) {
+      // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
+      const int broadcast_bcast_dim =
+          IsColMajor ? 2 * params.inner_dim_count + 1
+                     : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+      params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
+      params.bcast_input_strides[broadcast_bcast_dim] = 0;
+      params.bcast_block_strides[broadcast_bcast_dim] =
+          params.output_strides[params.bcast_dim];
+
+      return BroadcastBlock(
+          params.input_block_sizes, params.input_block_strides,
+          params.bcast_block_sizes, params.bcast_block_strides,
+          params.bcast_input_strides, bcast_offset, 0, scratch,
+          materialized_output, materialized_input, materialized_input_size);
+
+    } else {
+      // Keep track of the total number of the coefficients written to the
+      // output block.
+      Index num_output_coeffs = 0;
+
+      // The general case. Let's denote the output block as
+      //
+      //   x[..., a:a+bcast_dim_size, :, ..., :]
+      //
+      // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
+      // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
+      // sub-blocks:
+      //
+      // (1) a:b, where b is the smallest multiple of
+      //     input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
+      //
+      // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
+      //     in [a, a+bcast_dim_size].
+      //
+      // (3) c:a+bcast_dim_size .
+      //
+      // Or, when b and c do not exist, we just need to process the whole block
+      // together.
+
+      // Find a.
+      const Index bcast_dim_left_index =
+          bcast_offset / m_outputStrides[params.bcast_dim];
+
+      // Find b and c.
+      const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
+
+      // First multiple after a. This is b when <= bcast_dim_left_index +
+      // bcast_dim_size.
+      const Index first_multiple =
+          divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
+          input_bcast_dim_size;
+
+      if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
+        // b exists, so does c. Find it.
+        const Index last_multiple =
+            (bcast_dim_left_index + params.bcast_dim_size) /
+            input_bcast_dim_size * input_bcast_dim_size;
+        const int copy_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count
+                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
+        const int broadcast_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count + 1
+                       : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+        if (first_multiple > bcast_dim_left_index) {
+          const Index head_size = first_multiple - bcast_dim_left_index;
+          params.input_block_sizes[params.bcast_dim] = head_size;
+          params.bcast_block_sizes[copy_bcast_dim] = head_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, 0, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+        if (first_multiple < last_multiple) {
+          params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
+          params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] =
+              (last_multiple - first_multiple) / input_bcast_dim_size;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+          const Index offset = (first_multiple - bcast_dim_left_index) *
+                               m_outputStrides[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, offset, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+        if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
+          const Index tail_size =
+              bcast_dim_left_index + params.bcast_dim_size - last_multiple;
+          params.input_block_sizes[params.bcast_dim] = tail_size;
+          params.bcast_block_sizes[copy_bcast_dim] = tail_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+          const Index offset = (last_multiple - bcast_dim_left_index) *
+                               m_outputStrides[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, offset, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+      } else {
+        // b and c do not exist.
+        const int copy_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count
+                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
+        params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
+        params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
+        params.bcast_input_strides[copy_bcast_dim] =
+            params.input_block_strides[params.bcast_dim];
+        params.bcast_block_strides[copy_bcast_dim] =
+            params.output_strides[params.bcast_dim];
+
+        num_output_coeffs += BroadcastBlock(
+            params.input_block_sizes, params.input_block_strides,
+            params.bcast_block_sizes, params.bcast_block_strides,
+            params.bcast_input_strides, bcast_offset, 0, scratch,
+            materialized_output, materialized_input, materialized_input_size);
+      }
+
+      return num_output_coeffs;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
+      const Dimensions& input_block_sizes,
+      const Dimensions& input_block_strides,
+      const BroadcastDimensions& bcast_block_sizes,
+      const BroadcastDimensions& bcast_block_strides,
+      const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
+      Index offset, TensorBlockScratch& scratch,
+      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
+      size_t* materialized_input_size) const {
+    // ---------------------------------------------------------------------- //
+    // Tensor block descriptor for reading block from the input.
+    const Index input_offset = bcast_offset + offset;
+    TensorBlockDesc input_desc(
+        IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
+        input_block_sizes);
+
+    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
+
+    // ---------------------------------------------------------------------- //
+    // Materialize input block into a temporary memory buffer only if it's not
+    // already available in the arg block.
+    const ScalarNoConst* input_buffer = NULL;
+
+    if (input_block.data() != NULL) {
+      // Input block already has raw data, there is no need to materialize it.
+      input_buffer = input_block.data();
+
+    } else {
+      // Otherwise we have to do block assignment into a temporary buffer.
+
+      // Maybe reuse previously allocated buffer, or allocate a new one with a
+      // scratch allocator.
+      const size_t input_total_size = input_block_sizes.TotalSize();
+      if (*materialized_input == NULL ||
+          *materialized_input_size < input_total_size) {
+        *materialized_input_size = input_total_size;
+        void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
+        *materialized_input = static_cast<ScalarNoConst*>(mem);
+      }
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(input_block_sizes, input_block_strides,
+                                        *materialized_input),
+          input_block.expr());
+
+      input_buffer = *materialized_input;
+    }
+
+    // ---------------------------------------------------------------------- //
+    // Copy data from materialized input block to the materialized output, using
+    // given broadcast strides (strides with zeroes).
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
+        TensorBlockIO;
+
+    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
+    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
+                                      materialized_output + offset);
+
+    return TensorBlockIO::Copy(dst, src);
+  }
 
- protected:
-  const Broadcast m_broadcast;
+protected:
+  const Device EIGEN_DEVICE_REF m_device;
+  const typename internal::remove_reference<Broadcast>::type m_broadcast;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
   array<Index, NumDims> m_inputStrides;
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 1ba7ef170cc52919105854a1e3e962ec03075450..376457341120f2224b652d796942c80a293ecfa9 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -32,12 +32,13 @@ struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions - 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<DenseIndex DimId, typename XprType>
 struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
 {
-  typedef const TensorChippingOp<DimId, XprType>& type;
+  typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
 };
 
 template<DenseIndex DimId, typename XprType>
@@ -50,6 +51,7 @@ template <DenseIndex DimId>
 struct DimensionId
 {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+    EIGEN_UNUSED_VARIABLE(dim);
     eigen_assert(dim == DimId);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
@@ -78,44 +80,28 @@ template<DenseIndex DimId, typename XprType>
 class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+    typedef TensorBase<TensorChippingOp<DimId, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
+        : m_xpr(expr), m_offset(offset), m_dim(dim) {
+    }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
-      : m_xpr(expr), m_offset(offset), m_dim(dim) {
-  }
+    EIGEN_DEVICE_FUNC
+    const Index offset() const { return m_offset; }
+    EIGEN_DEVICE_FUNC
+    const Index dim() const { return m_dim.actualDim(); }
 
-  EIGEN_DEVICE_FUNC
-  const Index offset() const { return m_offset; }
-  EIGEN_DEVICE_FUNC
-  const Index dim() const { return m_dim.actualDim(); }
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC
-  const typename internal::remove_all<typename XprType::Nested>::type&
-  expression() const { return m_xpr; }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other)
-  {
-    typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-    return *this;
-  }
-
-  template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other)
-  {
-    typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-    return *this;
-  }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -136,20 +122,49 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets.
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    // Chipping of outer-most dimension is a trivial operation, because we can
+    // read and write directly from the underlying tensor using single offset.
+    IsOuterChipping   = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
+                        (static_cast<int>(Layout) == RowMajor && DimId == 0),
+    // Chipping inner-most dimension.
+    IsInnerChipping   = (static_cast<int>(Layout) == ColMajor && DimId == 0) ||
+                        (static_cast<int>(Layout) == RowMajor && DimId == NumInputDims - 1),
+    // Prefer block access if the underlying expression prefers it, otherwise
+    // only if chipping is not trivial.
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess ||
+                        !IsOuterChipping,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef internal::TensorBlockDescriptor<NumInputDims, Index>
+      ArgTensorBlockDesc;
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
   {
     EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -185,12 +200,12 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -205,21 +220,20 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
-    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
-	(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+    if (isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       Index inputIndex = index * m_inputStride + m_inputOffset;
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         values[i] = m_impl.coeff(inputIndex);
         inputIndex += m_inputStride;
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
-    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
-	       (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(m_stride > index);
       return m_impl.template packet<LoadMode>(index + m_inputOffset);
     } else {
@@ -231,6 +245,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
       } else {
         // Cross the stride boundary. Fallback to slow path.
         EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+       EIGEN_UNROLL_LOOP
         for (int i = 0; i < PacketSize; ++i) {
           values[i] = coeff(index);
           ++index;
@@ -263,29 +278,100 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
            TensorOpCost(0, 0, cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
-    CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
-    if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
-         (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) &&
-        result) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool root_of_expr_ast = false) const {
+    const Index chip_dim = m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i]
+            = i < chip_dim ? desc.dimension(i)
+            : i > chip_dim ? desc.dimension(i - 1)
+            : 1;
+    }
+
+    ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
+
+    // Try to reuse destination buffer for materializing argument block.
+    if (desc.HasDestinationBuffer()) {
+      DSizes<Index, NumInputDims> arg_destination_strides;
+      for (int i = 0; i < NumInputDims; ++i) {
+      arg_destination_strides[i]
+            = i < chip_dim ? desc.destination().strides()[i]
+            : i > chip_dim ? desc.destination().strides()[i - 1]
+            : 0; // for dimensions of size `1` stride should never be used.
+      }
+
+      arg_desc.template AddDestinationBuffer<Layout>(
+          desc.destination().template data<ScalarNoConst>(),
+          arg_destination_strides);
+    }
+
+    ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+
+    if (arg_block.data() != NULL) {
+      // Forward argument block buffer if possible.
+      return TensorBlock(arg_block.kind(), arg_block.data(),
+                           desc.dimensions());
+
+    } else {
+      // Assign argument block expression to a buffer.
+
+      // Prepare storage for the materialized chipping result.
+      const typename TensorBlock::Storage block_storage =
+          TensorBlock::prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              arg_desc.dimensions(),
+              internal::strides<Layout>(arg_desc.dimensions()),
+              block_storage.data()),
+          arg_block.expr());
+
+      return block_storage.AsTensorMaterializedBlock();
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
+    if (isOuterChipping() && result) {
       return result + m_inputOffset;
     } else {
       return NULL;
     }
   }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex;
-    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
-	(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+    if (isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       inputIndex = index * m_inputStride + m_inputOffset;
-    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
-	       (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer
+      // division.
       eigen_assert(m_stride > index);
       inputIndex = index + m_inputOffset;
     } else {
@@ -297,13 +383,25 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     return inputIndex;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const {
+    return IsInnerChipping ||
+           (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == 0) ||
+           (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const {
+    return IsOuterChipping ||
+           (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) ||
+           (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == 0);
+  }
+
   Dimensions m_dimensions;
   Index m_stride;
   Index m_inputOffset;
   Index m_inputStride;
   TensorEvaluator<ArgType, Device> m_impl;
   const internal::DimensionId<DimId> m_dim;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
@@ -321,15 +419,21 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    RawAccess = false
+    IsAligned     = false,
+    PacketAccess  = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess   = TensorEvaluator<ArgType, Device>::RawAccess,
+    Layout        = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess     = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
@@ -343,20 +447,19 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   {
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
 
-    if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
-	(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
+    if (this->isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(this->m_stride == 1);
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         this->m_impl.coeffRef(inputIndex) = values[i];
         inputIndex += this->m_inputStride;
       }
-    } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
-	       (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (this->isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(this->m_stride > index);
       this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
     } else {
@@ -369,6 +472,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
         // Cross stride boundary. Fallback to slow path.
         EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
         internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+        EIGEN_UNROLL_LOOP
         for (int i = 0; i < PacketSize; ++i) {
           this->coeffRef(index) = values[i];
           ++index;
@@ -376,6 +480,36 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
       }
     }
   }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    assert(this->m_impl.data() != NULL);
+
+    const Index chip_dim = this->m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+                          : i > chip_dim ? desc.dimension(i - 1)
+                          : 1;
+    }
+
+    typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
+                              const typename TensorBlock::XprType>
+        TensorBlockExpr;
+
+    typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
+                                            TensorBlockExpr, Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(
+            input_block_dims,
+            internal::strides<Layout>(this->m_impl.dimensions()),
+            this->m_impl.data(), this->srcCoeff(desc.offset())),
+        block.expr().reshape(input_block_dims));
+  }
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 59bf90d93902d2e37c54138d47470e335351d4d7..5235a8e6f93f9d0571ba1acbe7e7e28990b9e5d4 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -37,6 +37,8 @@ struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
   static const int NumDimensions = traits<LhsXprType>::NumDimensions;
   static const int Layout = traits<LhsXprType>::Layout;
   enum { Flags = 0 };
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                               typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
 };
 
 template<typename Axis, typename LhsXprType, typename RhsXprType>
@@ -58,6 +60,7 @@ template<typename Axis, typename LhsXprType, typename RhsXprType>
 class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
 {
   public:
+    typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base;
     typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
     typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
     typedef typename internal::traits<TensorConcatenationOp>::Index Index;
@@ -79,25 +82,7 @@ class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsX
 
     EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other)
-    {
-      typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp)
   protected:
     typename LhsXprType::Nested m_lhs_xpr;
     typename RhsXprType::Nested m_rhs_xpr;
@@ -117,14 +102,24 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
+                        TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -177,14 +172,14 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
   {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+  EIGEN_STRONG_INLINE void cleanup()
   {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
@@ -215,11 +210,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
       Index left_index;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         left_index = subs[0];
+        EIGEN_UNROLL_LOOP
         for (int i = 1; i < NumDims; ++i) {
           left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
         }
       } else {
         left_index = subs[NumDims - 1];
+        EIGEN_UNROLL_LOOP
         for (int i = NumDims - 2; i >= 0; --i) {
           left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
         }
@@ -231,11 +228,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
       Index right_index;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         right_index = subs[0];
+        EIGEN_UNROLL_LOOP
         for (int i = 1; i < NumDims; ++i) {
           right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
         }
       } else {
         right_index = subs[NumDims - 1];
+        EIGEN_UNROLL_LOOP
         for (int i = NumDims - 2; i >= 0; --i) {
           right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
         }
@@ -248,11 +247,12 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -275,7 +275,15 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
            TensorOpCost(0, 0, compute_cost);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+  #endif
 
   protected:
     Dimensions m_dimensions;
@@ -296,13 +304,21 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
   typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
   typedef typename Base::Dimensions Dimensions;
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
+                        TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
     : Base(op, device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -344,7 +360,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 20b29e5fde9868c690c1096c88fe20c59fe58e8c..8b35f7985160a0424ec78aebe59e58b7f2c475fe 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -21,8 +21,8 @@ namespace Eigen {
   */
 namespace internal {
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
@@ -38,53 +38,305 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
 
   // From NumDims below.
-  static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
+  static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
   static const int Layout = traits<LhsXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                               typename traits<LhsXprType>::PointerType,
+                               typename traits<RhsXprType>::PointerType>::type
+      PointerType;
 
   enum {
     Flags = 0
   };
 };
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense>
 {
-  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
+  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type;
 };
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >::type>
 {
-  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
+  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type;
 };
 
-template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
-struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
+template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_, typename Device_>
+struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_> > {
   typedef Indices_ Indices;
   typedef LeftArgType_ LeftArgType;
   typedef RightArgType_ RightArgType;
+  typedef OutputKernelType_ OutputKernelType;
   typedef Device_ Device;
 
   // From NumDims below.
   static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
 };
 
+// Helper class to allocate and deallocate temporary memory for packed buffers.
+template <typename LhsScalar, typename RhsScalar>
+struct TensorContractionBlockMemAllocator {
+  typedef void* BlockMemHandle;
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm,
+                                                   const Index bk,
+                                                   const Index bn,
+                                                   LhsScalar** lhs_block,
+                                                   RhsScalar** rhs_block) {
+    eigen_assert(lhs_block);
+    eigen_assert(rhs_block);
+    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+    char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size));
+    eigen_assert(block_mem);
+    *lhs_block = reinterpret_cast<LhsScalar*>(block_mem);
+    *rhs_block = reinterpret_cast<RhsScalar*>(block_mem + sz.lhs_size);
+    return block_mem;
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(
+      Device& d, const Index bm, const Index bk, const Index bn,
+      const Index num_lhs, const Index num_rhs, const Index num_slices,
+      std::vector<LhsScalar*>* lhs_blocks,
+      std::vector<RhsScalar*>* rhs_blocks) {
+    eigen_assert(num_slices > 0);
+    eigen_assert(num_lhs >= 0 && num_rhs >= 0);
+    eigen_assert(num_lhs == 0 || lhs_blocks);
+    eigen_assert(num_rhs == 0 || rhs_blocks);
+    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+    void* block_mem = d.allocate(
+        (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices);
+    eigen_assert(block_mem);
+    char* mem = static_cast<char*>(block_mem);
+
+    for (Index x = 0; x < num_slices; x++) {
+      if (num_lhs > 0) lhs_blocks[x].resize(num_lhs);
+      for (Index m = 0; m < num_lhs; m++) {
+        lhs_blocks[x][m] = reinterpret_cast<LhsScalar*>(mem);
+        mem += sz.lhs_size;
+      }
+      if (num_rhs > 0) rhs_blocks[x].resize(num_rhs);
+      for (Index n = 0; n < num_rhs; n++) {
+        rhs_blocks[x][n] = reinterpret_cast<RhsScalar*>(mem);
+        mem += sz.rhs_size;
+      }
+    }
+
+    return block_mem;
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+    d.deallocate(handle);
+  }
+
+ private:
+  struct BlockSizes {
+    Index lhs_size;
+    Index rhs_size;
+  };
+  EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm,
+                                                              const Index bk,
+                                                              const Index bn) {
+    Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+    BlockSizes sz;
+    sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
+    sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
+    return sz;
+  }
+};
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlocPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on Eigen contraction evaluator.
+// This also includes full support in TensorContractionThreadPool, assuming that
+// underlying gemm do not use it's own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice almost
+//   always is Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always column major blas_data_mapper (it must be of ResScalar
+//   type).
+//
+// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
+//   view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
+//   mapper).
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+    typename StorageIndex, typename OutputMapper, typename LhsMapper,
+    typename RhsMapper>
+struct TensorContractionKernel {
+  // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
+  // (otherwise beta should be always equal to 1).
+  enum { HasBeta = false };
+
+  EIGEN_DEVICE_FUNC
+  TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_,
+                          StorageIndex bm_, StorageIndex bk_, StorageIndex bn_)
+      : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
+
+  // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
+  typedef LhsScalar* LhsBlock;
+  typedef RhsScalar* RhsBlock;
+
+  // Packed Lhs/Rhs block memory allocator.
+  typedef TensorContractionBlockMemAllocator<LhsScalar, RhsScalar>
+      BlockMemAllocator;
+  typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle;
+
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<
+      LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr,
+      Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+      /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block,
+                                            RhsBlock* rhs_block) {
+    return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block);
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(
+      Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs,
+      const StorageIndex num_slices, std::vector<LhsBlock>* lhs_blocks,
+      std::vector<RhsBlock>* rhs_blocks) {
+    return BlockMemAllocator::allocateSlices(
+        d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks);
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+    BlockMemAllocator::deallocate(d, handle);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(
+      LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
+        /*offset*/ 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(
+      RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(*rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(
+      const OutputMapper& output_mapper, const LhsBlock& lhsBlock,
+      const RhsBlock& rhsBlock, const StorageIndex rows,
+      const StorageIndex depth, const StorageIndex cols,
+      const ResScalar alpha, const ResScalar beta) {
+    // Default GEBP kernel does not support beta.
+    eigen_assert(beta == ResScalar(1));
+    static const int kComputeStrideFromBlockDimensions = -1;
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+        /*strideA*/ kComputeStrideFromBlockDimensions,
+        /*strideB*/ kComputeStrideFromBlockDimensions,
+        /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+
+ private:
+  // These are dimensions of the original Tensors, and selected block sizes. The
+  // actual block sizes passed to all function above might be smaller because of
+  // the partial blocks at the end.
+  const StorageIndex m;
+  const StorageIndex k;
+  const StorageIndex n;
+  const StorageIndex bm;
+  const StorageIndex bk;
+  const StorageIndex bn;
+};
+
 }  // end namespace internal
 
-template<typename Indices, typename LhsXprType, typename RhsXprType>
-class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+// Tensor contraction params that should enable to get from output matrix
+// 2-dimensional coordinates to the output tensor dimensions.
+struct TensorContractionParams {
+  // TensorContraction evaluator assumes that both tensors are in ColMajor
+  // layout, if tensors are in RowMajor evaluator swap lhs with rhs.
+  bool swapped_arguments;
+};
+
+// Output kernel allows to fuse operations into the tensor contraction.
+//
+// Examples:
+//   1. Elementwise Relu transformation following Conv2D.
+//   2. AddBias to the Conv2D output channels dimension.
+//
+// The NoOpOutputKernel implements an output kernel that does absolutely nothing.
+struct NoOpOutputKernel {
+  /**
+   * Tensor contraction evaluator calls this kernel after finishing each block
+   * of output matrix. Output blocks belong to the 2-dimensional output tensor.
+   *
+   * TensorContractionParams contains contraction dimensions information
+   * required to map output 2-d space into the expected output tensor space
+   * (potentially higher dimensional).
+   *
+   * \param[in] output_mapper Access to output tensor memory
+   * \param[in] params   Tensor contraction parameters
+   * \param[in] i        Index of a first row available through output_mapper
+   * \param[in] j        Index of a first column available through output_mapper
+   * \param[in] num_rows Number of available rows
+   * \param[in] num_cols Number of available columns
+   */
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams& params, Index i,
+      Index j, Index num_rows, Index num_cols) const {
+    EIGEN_UNUSED_VARIABLE(output_mapper);
+    EIGEN_UNUSED_VARIABLE(params);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(num_rows);
+    EIGEN_UNUSED_VARIABLE(num_cols);
+  }
+};
+
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel>
+class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
   typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
-                                                   typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
+                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
-      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
-      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
+      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims,
+      const OutputKernelType& output_kernel = OutputKernelType())
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims),
+        m_output_kernel(output_kernel) {}
 
   EIGEN_DEVICE_FUNC
   const Indices& indices() const { return m_indices; }
@@ -98,35 +350,48 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
   const typename internal::remove_all<typename RhsXprType::Nested>::type&
   rhsExpression() const { return m_rhs_xpr; }
 
+  EIGEN_DEVICE_FUNC
+  const OutputKernelType& outputKernel() const { return m_output_kernel; }
+
   protected:
     typename LhsXprType::Nested m_lhs_xpr;
     typename RhsXprType::Nested m_rhs_xpr;
     const Indices m_indices;
+    const OutputKernelType m_output_kernel;
 };
 
 
 template<typename Derived>
-struct TensorContractionEvaluatorBase
+struct TensorContractionEvaluatorBase : internal::no_assignment_operator
 {
   typedef typename internal::traits<Derived>::Indices Indices;
   typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
   typedef typename internal::traits<Derived>::RightArgType RightArgType;
+  typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType;
   typedef typename internal::traits<Derived>::Device Device;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
+    IsAligned         = true,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = false,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
   };
 
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
   // Most of the code is assuming that both input tensors are ColMajor. If the
   // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
   // If we want to compute A * B = C, where A is LHS and B is RHS, the code
@@ -136,6 +401,9 @@ struct TensorContractionEvaluatorBase
   typedef typename internal::conditional<
     static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
 
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
+
   static const int LDims =
       internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
   static const int RDims =
@@ -149,16 +417,17 @@ struct TensorContractionEvaluatorBase
 
   typedef DSizes<Index, NumDims> Dimensions;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EIGEN_STRONG_INLINE
   TensorContractionEvaluatorBase(const XprType& op, const Device& device)
-    : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+      : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
                           op.lhsExpression(), op.rhsExpression()), device),
-    m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
-                          op.rhsExpression(), op.lhsExpression()), device),
+        m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+                           op.rhsExpression(), op.lhsExpression()), device),
         m_device(device),
+        m_output_kernel(op.outputKernel()),
         m_result(NULL) {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
-			   static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
                         YOU_MADE_A_PROGRAMMING_MISTAKE);
 
 
@@ -233,7 +502,7 @@ struct TensorContractionEvaluatorBase
     // dimensions and right non-contracting dimensions.
     m_lhs_inner_dim_contiguous = true;
     int dim_idx = 0;
-    unsigned int nocontract_idx = 0;
+    Index nocontract_idx = 0;
 
     for (int i = 0; i < LDims; i++) {
       // find if we are contracting on index i of left tensor
@@ -323,64 +592,144 @@ struct TensorContractionEvaluatorBase
         numext::swap(m_dimensions[i], m_dimensions[j]);
       }
     }
+
+    // A set of parameters that will allow output kernel to get from output
+    // tensor dimensions (i, j) into the original tensor dimensions.
+    // TODO(ezhulenev): Add parameters required to infer output tensor index for
+    // more complex contractions than 2x2 on internal dimension.
+    m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
-    if (this->m_lhs_inner_dim_contiguous) {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-    else {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+        if (dest) {
+          evalToAsync(dest, [done]() { done(false); });
+        } else {
+          m_result = static_cast<EvaluatorPointerType>(
+              m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+          evalToAsync(m_result, [done]() { done(true); });
         }
-      }
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+#ifndef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+  if (this->m_lhs_inner_dim_contiguous) {                    \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, true, true, ALIGNMENT> ARGS;            \
+      } else {                                               \
+        METHOD<true, true, false, ALIGNMENT> ARGS;           \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, false, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<true, false, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    }                                                        \
+  } else {                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, true, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<false, true, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, false, true, ALIGNMENT> ARGS;          \
+      } else {                                               \
+        METHOD<false, false, false, ALIGNMENT> ARGS;         \
+      }                                                      \
+    }                                                        \
+  }
+#endif
+
+#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH
+#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
+  if (this->m_lhs_inner_dim_contiguous) {                                    \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, true, true, true, ALIGNMENT> ARGS)->FN;            \
+      } else {                                                               \
+        (new METHOD<DONE, true, true, false, ALIGNMENT> ARGS)->FN;           \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, true, false, true, ALIGNMENT> ARGS)->FN;           \
+      } else {                                                               \
+        (new METHOD<DONE, true, false, false, ALIGNMENT> ARGS)->FN;          \
+      }                                                                      \
+    }                                                                        \
+  } else {                                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, false, true, true, ALIGNMENT> ARGS)->FN;           \
+      } else {                                                               \
+        (new METHOD<DONE, false, true, false, ALIGNMENT> ARGS)->FN;          \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, false, false, true, ALIGNMENT> ARGS)->FN;          \
+      } else {                                                               \
+        (new METHOD<DONE, false, false, false, ALIGNMENT> ARGS)->FN;         \
+      }                                                                      \
+    }                                                                        \
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+   static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalToCallback>
+  void evalToAsync(Scalar* buffer, EvalToCallback done) const {
+    static_cast<const Derived*>(this)
+        ->template evalProductAsync<EvalToCallback, Unaligned>(buffer,
+                                                               std::move(done));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  void evalProductSequential(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous,
+                              rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                              Alignment>(buffer);
+    } else {
+      this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                              rhs_inner_dim_reordered, Alignment>(buffer);
     }
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+  void evalGemv(Scalar* buffer) const {
     const Index rows = m_i_size;
     const Index cols = m_k_size;
 
@@ -418,12 +767,41 @@ struct TensorContractionEvaluatorBase
     internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
         rows, cols, lhs, rhs,
         buffer, resIncr, alpha);
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params,
+                    static_cast<Index>(0), static_cast<Index>(0), rows,
+                    static_cast<Index>(1));
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+  void evalGemm(Scalar* buffer) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
+    this->template evalGemmPartial<lhs_inner_dim_contiguous,
+                                   rhs_inner_dim_contiguous,
+                                   rhs_inner_dim_reordered,
+                                   Alignment, true>(buffer, 0, k, 1);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+      bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
+      Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                    rhs_inner_dim_reordered, Alignment,
+        /*use_output_kernel*/ false>(buffer, k_start, k_end,
+                                     num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel>
+  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
+    // columns in slice on left side, rows on right side
+    const Index k_slice = k_end - k_start;
 
     // rows in left side
     const Index m = this->m_i_size;
@@ -431,16 +809,9 @@ struct TensorContractionEvaluatorBase
     // columns in right side
     const Index n = this->m_j_size;
 
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-    // define mr, nr, and all of my data mapper types
+    // define data mappers for Lhs and Rhs
     typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
     typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-    const Index nr = Traits::nr;
-    const Index mr = Traits::mr;
 
     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
@@ -462,11 +833,9 @@ struct TensorContractionEvaluatorBase
 
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
 
-    // Declare GEBP packing and kernel structs
-    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
-    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
-    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     // initialize data mappers
     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
@@ -478,42 +847,72 @@ struct TensorContractionEvaluatorBase
     OutputMapper output(buffer, m);
 
     // Sizes of the blocks to load in cache. See the Goto paper for details.
-    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+    internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar,
+                                        Index, internal::ShardByCol>
+        blocking(k_slice, m, n, num_threads);
     const Index kc = blocking.kc();
     const Index mc = numext::mini(m, blocking.mc());
     const Index nc = numext::mini(n, blocking.nc());
-    const Index sizeA = mc * kc;
-    const Index sizeB = kc * nc;
 
-    LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
-    RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+
+    LhsBlock blockA;
+    RhsBlock blockB;
+
+    TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc);
+
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+    const BlockMemHandle packed_mem =
+        kernel.allocate(this->m_device, &blockA, &blockB);
+
+    // If a contraction kernel does not support beta, explicitly initialize
+    // output buffer with zeroes.
+    if (!TensorContractionKernel::HasBeta) {
+      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+    }
 
     for(Index i2=0; i2<m; i2+=mc)
     {
       const Index actual_mc = numext::mini(i2+mc,m)-i2;
-      for (Index k2 = 0; k2 < k; k2 += kc) {
+      for (Index k2 = k_start; k2 < k_end; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
-        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
-        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+        const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
+        kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        // If kernel supports beta, there is no need to initialize output
+        // buffer with zeroes.
+        const Scalar alpha = Scalar(1);
+        const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start)
+                                ? Scalar(0)
+                                : Scalar(1);
 
         // series of horizontal blocks
         for (Index j2 = 0; j2 < n; j2 += nc) {
           // make sure we don't overshoot right edge of right matrix, then pack block
           const Index actual_nc = numext::mini(j2 + nc, n) - j2;
-          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+          kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc,
+                         actual_nc);
 
           // call gebp (matrix kernel)
           // The parameters here are copied from Eigen's GEMM implementation
-          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
+          const OutputMapper output_mapper = output.getSubMapper(i2, j2);
+          kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc,
+                        actual_nc, alpha, beta);
+
+          // We are done with this [i2, j2] output block.
+          if (use_output_kernel && k2 + kc >= k_end) {
+            m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
+                            actual_mc, actual_nc);
+          }
         }
       }
     }
 
-    this->m_device.deallocate(blockA);
-    this->m_device.deallocate(blockB);
+    kernel.deallocate(this->m_device, packed_mem);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
 
@@ -536,11 +935,9 @@ struct TensorContractionEvaluatorBase
     return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
 
-  protected:
-  // Prevent assignment
-  TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
+protected:
   Dimensions m_dimensions;
 
   contract_t m_k_strides;
@@ -560,22 +957,25 @@ struct TensorContractionEvaluatorBase
   Index m_j_size;
   Index m_k_size;
 
+  TensorContractionParams m_tensor_contraction_params;
+
   TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
   TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
-  const Device& m_device;
-  Scalar* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  OutputKernelType m_output_kernel;
+  EvaluatorPointerType m_result;
 };
 
 
 // evaluator for default device
-template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> :
     public TensorContractionEvaluatorBase<
-      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> > {
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
   typedef TensorContractionEvaluatorBase<Self> Base;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -609,17 +1009,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   // Could we use NumDimensions here?
   typedef DSizes<Index, NumDims> Dimensions;
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+  TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) { }
 
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
-    if (this->m_j_size == 1) {
-      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-      return;
-    }
-
-    this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+  template <int Alignment>
+  void evalProduct(Scalar* buffer) const {
+    TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer));
   }
 };
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 5cf7b4f7189a7584ff19c0d1730d49dbe87a4733..974feb0ad22fb7b78198c5ab2ccbcca90be62c12 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -21,14 +21,28 @@ enum {
 
 
 // Default Blocking Strategy
-template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType=ShardByCol>
+template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
 class TensorContractionBlocking {
  public:
 
-  typedef typename LhsMapper::Scalar LhsScalar;
-  typedef typename RhsMapper::Scalar RhsScalar;
+ /*
+   adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
+     requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
+     which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`
+     which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+     (else HIPCC will error out)
 
-  EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+   However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+   results in NVCC erroring out with the following error
+
+   ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
+      dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
+ */
+
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+ TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
       kc_(k), mc_(m), nc_(n)
   {
     if (ShardingType == ShardByCol) {
@@ -37,19 +51,22 @@ class TensorContractionBlocking {
     else {
       computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
     }
+
+    const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+    kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
+      kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
 
  private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
 };
 
-
 } // end namespace internal
 } // end namespace Eigen
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index d65dbb40f1887ac379cf51cafe58981cdb5943ee..3f315fedc54b64adc75ae42f4b8a7cc9202ea1b8 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -1,1391 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
+#endif
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-
-namespace Eigen {
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
-__device__ EIGEN_STRONG_INLINE void
-EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
-                       const Index m_size, const Index n_size, const Index k_size) {
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  // declare and initialize 64 registers for output 8x8 block
-
-  // prefetch registers
-  Scalar lhs_pf0;
-  Scalar lhs_pf1;
-  Scalar lhs_pf2;
-  Scalar lhs_pf3;
-  Scalar lhs_pf4;
-  Scalar lhs_pf5;
-  Scalar lhs_pf6;
-  Scalar lhs_pf7;
-
-  Scalar rhs_pf0;
-  Scalar rhs_pf1;
-  Scalar rhs_pf2;
-  Scalar rhs_pf3;
-  Scalar rhs_pf4;
-  Scalar rhs_pf5;
-  Scalar rhs_pf6;
-  Scalar rhs_pf7;
-
-  // shared memory is formatted
-  // (contract idx in block, nocontract idx in block, block idx)
-  // where block idx is column major. This transposition limits the number of
-  // bank conflicts when reading the LHS. The core idea is that since the contracting
-  // index is shared by both sides, then the contracting index should be in threadIdx.x.
-
-  // On the LHS, we pad each row inside of each block with an extra element. This makes
-  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
-  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
-
-  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
-  // conflicts on writes and also none on reads.
-
-  // storage indices
-  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
-  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
-
-  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
-  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
-  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
-  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
-  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
-  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
-  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
-  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
-
-  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
-  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
-  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
-  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
-  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
-  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
-  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
-  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
-
-  // in the loading code, the following variables are important:
-  // threadIdx.x: the vertical position in an 8x8 block
-  // threadIdx.y: the vertical index of the 8x8 block in the grid
-  // threadIdx.z: the horizontal position in an 8x8 block
-  // k: the horizontal index of the 8x8 block in the grid
-  //
-  // The k parameter is implicit (it was the loop counter for a loop that went
-  // from 0 to <8, but now that loop is unrolled in the below code.
-
-  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
-  const Index lhs_vert = base_m + load_idx_vert;
-
-#define prefetchIntoRegisters(base_k)                           \
-  {                                                             \
-    lhs_pf0 = conv(0);                                          \
-    lhs_pf1 = conv(0);                                          \
-    lhs_pf2 = conv(0);                                          \
-    lhs_pf3 = conv(0);                                          \
-    lhs_pf4 = conv(0);                                          \
-    lhs_pf5 = conv(0);                                          \
-    lhs_pf6 = conv(0);                                          \
-    lhs_pf7 = conv(0);                                          \
-                                                                \
-    rhs_pf0 = conv(0);                                          \
-    rhs_pf1 = conv(0);                                          \
-    rhs_pf2 = conv(0);                                          \
-    rhs_pf3 = conv(0);                                          \
-    rhs_pf4 = conv(0);                                          \
-    rhs_pf5 = conv(0);                                          \
-    rhs_pf6 = conv(0);                                          \
-    rhs_pf7 = conv(0);                                          \
-                                                                \
-    if (!needs_edge_check || lhs_vert < m_size) {               \
-      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
-      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
-      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
-      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
-      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
-      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
-      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
-      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
-                                                                \
-      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
-        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
-      } else if (lhs_horiz_6 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
-      } else if (lhs_horiz_5 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-      } else if (lhs_horiz_4 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-      } else if (lhs_horiz_3 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-      } else if (lhs_horiz_2 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-      } else if (lhs_horiz_1 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-      } else if (lhs_horiz_0 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-      }                                                         \
-    }                                                           \
-                                                                \
-    const Index rhs_vert = base_k + load_idx_vert;              \
-    if (!needs_edge_check || rhs_vert < k_size) {               \
-      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
-      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
-      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
-      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
-      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
-      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
-      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
-      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
-                                                                \
-      if (rhs_horiz_7 < n_size) {                               \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
-        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
-      } else if (rhs_horiz_6 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
-      } else if (rhs_horiz_5 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-      } else if (rhs_horiz_4 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-      } else if (rhs_horiz_3 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-      } else if (rhs_horiz_2 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-      } else if (rhs_horiz_1 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-      } else if (rhs_horiz_0 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-      }                                                         \
-    }                                                           \
-  }                                                             \
-
-#define writeRegToShmem(_)                      \
-  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
-  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
-                                                \
-  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
-  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
-                                                \
-  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
-  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
-                                                \
-  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
-  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
-                                                \
-  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
-  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
-                                                \
-  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
-  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
-                                                \
-  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
-  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
-                                                \
-  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
-  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
-
-  // declare and initialize result array
-#define res(i, j) _res_##i##j
-#define initResultRow(i)                        \
-  Scalar res(i, 0) = conv(0);                   \
-  Scalar res(i, 1) = conv(0);                   \
-  Scalar res(i, 2) = conv(0);                   \
-  Scalar res(i, 3) = conv(0);                   \
-  Scalar res(i, 4) = conv(0);                   \
-  Scalar res(i, 5) = conv(0);                   \
-  Scalar res(i, 6) = conv(0);                   \
-  Scalar res(i, 7) = conv(0);                   \
-
-  internal::scalar_cast_op<int, Scalar> conv;
-  initResultRow(0);
-  initResultRow(1);
-  initResultRow(2);
-  initResultRow(3);
-  initResultRow(4);
-  initResultRow(5);
-  initResultRow(6);
-  initResultRow(7);
-#undef initResultRow
-
-  for (Index base_k = 0; base_k < k_size; base_k += 64) {
-    // wait for previous iteration to finish with shmem. Despite common sense,
-    // the code is a bit faster with this here then at bottom of loop
-    __syncthreads();
-
-    prefetchIntoRegisters(base_k);
-    writeRegToShmem();
-
-    #undef prefetchIntoRegisters
-    #undef writeRegToShmem
-
-    // wait for shared mem packing to be done before starting computation
-    __syncthreads();
-
-    // compute 8x8 matrix product by outer product. This involves packing one column
-    // of LHS and one row of RHS into registers (takes 16 registers).
-
-#define lcol(i) _lcol##i
-    Scalar lcol(0);
-    Scalar lcol(1);
-    Scalar lcol(2);
-    Scalar lcol(3);
-    Scalar lcol(4);
-    Scalar lcol(5);
-    Scalar lcol(6);
-    Scalar lcol(7);
-
-#define rrow(j) _rrow##j
-    Scalar rrow(0);
-    Scalar rrow(1);
-    Scalar rrow(2);
-    Scalar rrow(3);
-    Scalar rrow(4);
-    Scalar rrow(5);
-    Scalar rrow(6);
-    Scalar rrow(7);
-
-    // Now x corresponds to k, y to m, and z to n
-    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
-    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
-
-#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
-#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
-
-#define loadData(i, j)                          \
-    lcol(0) = lhs_element(0, j);               \
-    rrow(0) = rhs_element(i, 0);               \
-    lcol(1) = lhs_element(1, j);               \
-    rrow(1) = rhs_element(i, 1);               \
-    lcol(2) = lhs_element(2, j);               \
-    rrow(2) = rhs_element(i, 2);               \
-    lcol(3) = lhs_element(3, j);               \
-    rrow(3) = rhs_element(i, 3);               \
-    lcol(4) = lhs_element(4, j);               \
-    rrow(4) = rhs_element(i, 4);               \
-    lcol(5) = lhs_element(5, j);               \
-    rrow(5) = rhs_element(i, 5);               \
-    lcol(6) = lhs_element(6, j);               \
-    rrow(6) = rhs_element(i, 6);               \
-    lcol(7) = lhs_element(7, j);               \
-    rrow(7) = rhs_element(i, 7);               \
-
-#define computeCol(j)                           \
-    res(0, j) += lcol(0) * rrow(j);             \
-    res(1, j) += lcol(1) * rrow(j);             \
-    res(2, j) += lcol(2) * rrow(j);             \
-    res(3, j) += lcol(3) * rrow(j);             \
-    res(4, j) += lcol(4) * rrow(j);             \
-    res(5, j) += lcol(5) * rrow(j);             \
-    res(6, j) += lcol(6) * rrow(j);             \
-    res(7, j) += lcol(7) * rrow(j);             \
-
-#define computePass(i)                          \
-    loadData(i, i);                             \
-                                                \
-    computeCol(0);                              \
-    computeCol(1);                              \
-    computeCol(2);                              \
-    computeCol(3);                              \
-    computeCol(4);                              \
-    computeCol(5);                              \
-    computeCol(6);                              \
-    computeCol(7);                              \
-
-    computePass(0);
-    computePass(1);
-    computePass(2);
-    computePass(3);
-    computePass(4);
-    computePass(5);
-    computePass(6);
-    computePass(7);
-
-#undef lcol
-#undef rrow
-#undef lhs_element
-#undef rhs_element
-#undef loadData
-#undef computeCol
-#undef computePass
-  } // end loop over k
-
-  // we've now iterated over all of the large (ie width 64) k blocks and
-  // accumulated results in registers. At this point thread (x, y, z) contains
-  // the sum across all big k blocks of the product of little k block of index (x, y)
-  // with block of index (y, z). To compute the final output, we need to reduce
-  // the 8 threads over y by summation.
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
-
-#define reduceRow(i, mask)                      \
-  shuffleInc(i, 0, mask);                       \
-  shuffleInc(i, 1, mask);                       \
-  shuffleInc(i, 2, mask);                       \
-  shuffleInc(i, 3, mask);                       \
-  shuffleInc(i, 4, mask);                       \
-  shuffleInc(i, 5, mask);                       \
-  shuffleInc(i, 6, mask);                       \
-  shuffleInc(i, 7, mask);                       \
-
-#define reduceMatrix(mask)                      \
-  reduceRow(0, mask);                           \
-  reduceRow(1, mask);                           \
-  reduceRow(2, mask);                           \
-  reduceRow(3, mask);                           \
-  reduceRow(4, mask);                           \
-  reduceRow(5, mask);                           \
-  reduceRow(6, mask);                           \
-  reduceRow(7, mask);                           \
-
-  // actually perform the reduction, now each thread of index (_, y, z)
-  // contains the correct values in its registers that belong in the output
-  // block
-  reduceMatrix(1);
-  reduceMatrix(2);
-  reduceMatrix(4);
-
-#undef shuffleInc
-#undef reduceRow
-#undef reduceMatrix
-
-  // now we need to copy the 64 values into main memory. We can't split work
-  // among threads because all variables are in registers. There's 2 ways
-  // to do this:
-  // (1) have 1 thread do 64 writes from registers into global memory
-  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
-  //     each do 8 writes into global memory. We can just overwrite the shared
-  //     memory from the problem we just solved.
-  // (2) is slightly faster than (1) due to less branching and more ILP
-
-  // TODO: won't yield much gain, but could just use currently unused shared mem
-  //       and then we won't have to sync
-  // wait for shared mem to be out of use
-  __syncthreads();
-
-#define writeResultShmem(i, j)                                          \
-  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
-
-#define writeRow(i)                             \
-  writeResultShmem(i, 0);                       \
-  writeResultShmem(i, 1);                       \
-  writeResultShmem(i, 2);                       \
-  writeResultShmem(i, 3);                       \
-  writeResultShmem(i, 4);                       \
-  writeResultShmem(i, 5);                       \
-  writeResultShmem(i, 6);                       \
-  writeResultShmem(i, 7);                       \
-
-  if (threadIdx.x == 0) {
-    writeRow(0);
-    writeRow(1);
-    writeRow(2);
-    writeRow(3);
-    writeRow(4);
-    writeRow(5);
-    writeRow(6);
-    writeRow(7);
-  }
-#undef writeResultShmem
-#undef writeRow
-
-  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
-  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
-
-  if (threadIdx.x < max_i_write) {
-    if (max_j_write == 8) {
-      // TODO: can i trade bank conflicts for coalesced writes?
-      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
-      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
-      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
-      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
-      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
-      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
-      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
-      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
-
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
-    } else {
-#pragma unroll 7
-      for (int j = 0; j < max_j_write; j++) {
-        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
-        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
-      }
-    }
-  }
-#undef res
-}
-
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(512)
-EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ Scalar lhs_shmem[72 * 64];
-  __shared__ Scalar rhs_shmem[72 * 64];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size && base_n + 63 < n_size) {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  } else {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][16],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-  typedef float Scalar;
-
-  // prefetch registers
-  float4 lhs_pf0, rhs_pf0;
-
-  float4 results[4];
-  for (int i=0; i < 4; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-
-#define prefetch_lhs(reg, row, col)                   \
-    if (!CHECK_LHS_BOUNDARY) {                        \
-      if (col < k_size) {                             \
-        reg =lhs.loadPacket<Unaligned>(row, col);     \
-      }                                               \
-    } else {                                          \
-      if (col < k_size) {                             \
-        if (row + 3 < m_size) {                       \
-          reg =lhs.loadPacket<Unaligned>(row, col);   \
-        } else if (row + 2 < m_size) {                \
-          reg.x =lhs(row + 0, col);                   \
-          reg.y =lhs(row + 1, col);                   \
-          reg.z =lhs(row + 2, col);                   \
-        } else if (row + 1 < m_size) {                \
-          reg.x =lhs(row + 0, col);                   \
-          reg.y =lhs(row + 1, col);                   \
-        } else if (row  < m_size) {                   \
-          reg.x =lhs(row + 0, col);                   \
-        }                                             \
-      }                                               \
-    }                                                 \
-
-
-  Index lhs_vert = base_m+threadIdx.x*4;
-
-  for (Index k = 0; k < k_size; k += 16) {
-    lhs_pf0 = internal::pset1<float4>(0);
-    rhs_pf0 = internal::pset1<float4>(0);
-
-    Index lhs_horiz = threadIdx.y+k;
-    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
-
-    Index rhs_vert = k+(threadIdx.x%4)*4;
-    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
-
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-      } else if (rhs_vert  < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-      }
-    } else {
-      if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        } else if ((rhs_vert + 2) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        } else if ((rhs_vert + 1) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        } else if (rhs_vert  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        }
-      }
-    }
-    float x1, x2 ;
-    // the following can be a bitwise operation..... some day.
-    if((threadIdx.x%8) < 4) {
-      x1 = rhs_pf0.y;
-      x2 = rhs_pf0.w;
-    } else {
-      x1 = rhs_pf0.x;
-      x2 = rhs_pf0.z;
-    }
-    x1 = __shfl_xor(x1, 4);
-    x2 = __shfl_xor(x2, 4);
-    if((threadIdx.x%8) < 4) {
-      rhs_pf0.y = x1;
-      rhs_pf0.w = x2;
-    } else {
-      rhs_pf0.x = x1;
-      rhs_pf0.z = x2;
-    }
-
-    // We have 64 features.
-    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
-    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
-    // ...
-    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
-    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
-    // ...
-    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
-    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
-
-    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // ...
-    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
-    // ...
-
-    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
-    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
-
-
-#define add_vals(fl1, fl2, fr1, fr2)\
-    results[0].x += fl1.x * fr1.x;\
-    results[0].y += fl1.y * fr1.x;\
-    results[0].z += fl2.x * fr1.x;\
-    results[0].w += fl2.y * fr1.x;\
-\
-    results[1].x += fl1.x * fr1.y;\
-    results[1].y += fl1.y * fr1.y;\
-    results[1].z += fl2.x * fr1.y;\
-    results[1].w += fl2.y * fr1.y;\
-\
-    results[2].x += fl1.x * fr2.x;\
-    results[2].y += fl1.y * fr2.x;\
-    results[2].z += fl2.x * fr2.x;\
-    results[2].w += fl2.y * fr2.x;\
-\
-    results[3].x += fl1.x * fr2.y;\
-    results[3].y += fl1.y * fr2.y;\
-    results[3].z += fl2.x * fr2.y;\
-    results[3].w += fl2.y * fr2.y;\
-
-    __syncthreads();
-
-    // Do the multiplies.
-    #pragma unroll
-    for (int koff = 0; koff < 16; koff ++) {
-      // 32 x threads.
-      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
-      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
-
-      int start_feature = threadIdx.y * 4;
-      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-
-      add_vals(fl1, fl2, fr1, fr2)
-    }
-    __syncthreads();
-  }
-
-#undef prefetch_lhs
-#undef add_vals
-
-  Index horiz_base = threadIdx.y*4+base_n;
-  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
-    for (int i = 0; i < 4; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }
-  } else if (!CHECK_RHS_BOUNDARY) {
-    // CHECK LHS
-    if (lhs_vert + 3 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    } else if (lhs_vert + 2 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      }
-    } else if (lhs_vert + 1 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      }
-    } else if (lhs_vert  < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-      }
-    }
-  } else if (!CHECK_LHS_BOUNDARY) {
-    // CHECK RHS
-    /*
-    int ncols_rem = fminf(n_size- horiz_base, 4);
-    for (int i = 0; i < ncols_rem; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }*/
-    for (int i = 0; i < 4; i++) {
-      if (horiz_base+i < n_size) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-       }
-    }
-  } else {
-    // CHECK both boundaries.
-    for (int i = 0; i < 4; i++) {
-      if (horiz_base+i < n_size) {
-        if (lhs_vert < m_size)
-          output(lhs_vert, horiz_base + i) = results[i].x;
-        if (lhs_vert + 1 < m_size)
-          output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        if (lhs_vert + 2 < m_size)
-          output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        if (lhs_vert + 3 < m_size)
-          output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][32],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-  typedef float Scalar;
-
-  // prefetch registers
-  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
-  float4 rhs_pf0, rhs_pf1;
-
-  float4 results[8];
-  for (int i=0; i < 8; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-
-  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
-  for (Index k = 0; k < k_size; k += 32) {
-    lhs_pf0 = internal::pset1<float4>(0);
-    lhs_pf1 = internal::pset1<float4>(0);
-    lhs_pf2 = internal::pset1<float4>(0);
-    lhs_pf3 = internal::pset1<float4>(0);
-
-    rhs_pf0 = internal::pset1<float4>(0);
-    rhs_pf1 = internal::pset1<float4>(0);
-
-     if (!CHECK_LHS_BOUNDARY) {
-      if ((threadIdx.y/4+k+24) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
-      } else if ((threadIdx.y/4+k+16) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-      } else if ((threadIdx.y/4+k+8) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-      } else if ((threadIdx.y/4+k) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      }
-    } else {
-      // just CHECK_LHS_BOUNDARY
-      if (lhs_vert + 3 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-          lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert + 2 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
-          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert + 1 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-        }
-      }
-    }
-    __syncthreads();
-    Index rhs_vert = k+threadIdx.x*4;
-    Index rhs_horiz0 = threadIdx.y*2+base_n;
-    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-      } else if (rhs_vert  < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-      }
-    } else {
-      if (rhs_horiz1 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-          rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
-        } else if (rhs_vert + 2 < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
-        } else if (k+threadIdx.x*4 + 1 < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-        } else if (k+threadIdx.x*4  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        }
-      } else if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        } else if ((rhs_vert + 2) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        } else if ((rhs_vert + 1) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        } else if (rhs_vert  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        }
-      }
-    }
-    __syncthreads();
-    // Loaded. Do computation
-    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
-    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
-    // ..
-    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
-    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
-    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
-    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
-    // ..
-    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
-    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
-    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
-    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
-    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
-    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
-    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
-
-    // LHS.
-    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
-    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
-    // ...
-    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
-    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
-
-
-#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
-      results[0].x += a_feat1.x * f1.x;\
-      results[1].x += a_feat1.x * f1.y;\
-      results[2].x += a_feat1.x * f2.x;\
-      results[3].x += a_feat1.x * f2.y;\
-      results[4].x += a_feat1.x * f3.x;\
-      results[5].x += a_feat1.x * f3.y;\
-      results[6].x += a_feat1.x * f4.x;\
-      results[7].x += a_feat1.x * f4.y;\
-\
-      results[0].y += a_feat1.y * f1.x;\
-      results[1].y += a_feat1.y * f1.y;\
-      results[2].y += a_feat1.y * f2.x;\
-      results[3].y += a_feat1.y * f2.y;\
-      results[4].y += a_feat1.y * f3.x;\
-      results[5].y += a_feat1.y * f3.y;\
-      results[6].y += a_feat1.y * f4.x;\
-      results[7].y += a_feat1.y * f4.y;\
-\
-      results[0].z += a_feat2.x * f1.x;\
-      results[1].z += a_feat2.x * f1.y;\
-      results[2].z += a_feat2.x * f2.x;\
-      results[3].z += a_feat2.x * f2.y;\
-      results[4].z += a_feat2.x * f3.x;\
-      results[5].z += a_feat2.x * f3.y;\
-      results[6].z += a_feat2.x * f4.x;\
-      results[7].z += a_feat2.x * f4.y;\
-\
-      results[0].w += a_feat2.y * f1.x;\
-      results[1].w += a_feat2.y * f1.y;\
-      results[2].w += a_feat2.y * f2.x;\
-      results[3].w += a_feat2.y * f2.y;\
-      results[4].w += a_feat2.y * f3.x;\
-      results[5].w += a_feat2.y * f3.y;\
-      results[6].w += a_feat2.y * f4.x;\
-      results[7].w += a_feat2.y * f4.y;\
-
-    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
-    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
-    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
-    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
-
-    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
-    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
-    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
-    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
-
-    __syncthreads();
-
-    // Do the multiplies.
-    #pragma unroll
-    for (int koff = 0; koff < 32; koff ++) {
-      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
-      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
-
-      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
-      int start_feature = (threadIdx.y / 4) * 8;
-
-      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
-      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
-      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
-      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
-
-      add_vals(a3, a4, br1, br2, br3, br4)
-    }
-    __syncthreads();
-  } // end loop over k
-
-
-  __syncthreads();
-  Index horiz_base = (threadIdx.y/4)*8+base_n;
-  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
-    for (int i = 0; i < 8; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }
-  } else if (!CHECK_RHS_BOUNDARY) {
-    if (lhs_vert + 3 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    } else if (lhs_vert + 2 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      }
-    } else if (lhs_vert + 1 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      }
-    } else if (lhs_vert  < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-      }
-    }
-  } else if (!CHECK_LHS_BOUNDARY) {
-    // CHECK BOUNDARY_B
-    for (int i = 0; i < 8; i++) {
-      if (horiz_base + i < n_size) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  } else {
-    // CHECK both boundaries.
-    for (int i = 0; i < 8; i++) {
-      if (horiz_base + i < n_size) {
-        if (lhs_vert < m_size)
-          output(lhs_vert, horiz_base + i) = results[i].x;
-        if (lhs_vert + 1 < m_size)
-          output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        if (lhs_vert + 2 < m_size)
-          output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        if (lhs_vert + 3 < m_size)
-          output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256)
-EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ float2 lhs_shmem[64*32];
-  __shared__ float2 rhs_shmem[128*8];
-
-  typedef float2 LHS_MEM[64][32];
-  typedef float2 RHS_MEM[128][8];
-
-  typedef float2 LHS_MEM16x16[32][16];
-  typedef float2 RHS_MEM16x16[64][8];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 128 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  bool check_rhs = (base_n + 63) >= n_size;
-  bool check_lhs128 = (base_m + 127) >= m_size;
-
-  if (!check_rhs) {
-    if (!check_lhs128) {
-      // >= 128 rows left
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    }
-  } else {
-    if (!check_lhs128) {
-      // >= 128 rows left
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    }
-  }
-}
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256)
-EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ float2 lhs_shmem[32][16];
-  __shared__ float2 rhs_shmem[64][8];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size) {
-    if (base_n + 63 < n_size) {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    }
-  } else {
-    if (base_n + 63 < n_size) {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    }
-  }
-}
-
-
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
-    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
-
-  typedef GpuDevice Device;
-
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
-  typedef TensorContractionEvaluatorBase<Self> Base;
-
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
-  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
-
-  enum {
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-  };
-
-  // Most of the code is assuming that both input tensors are ColMajor. If the
-  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
-  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
-  // will pretend B is LHS and A is RHS.
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
-  static const int LDims =
-      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
-  static const int RDims =
-      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
-  static const int ContractDims = internal::array_size<Indices>::value;
-
-  typedef array<Index, LDims> left_dim_mapper_t;
-  typedef array<Index, RDims> right_dim_mapper_t;
-
-  typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, LDims - ContractDims> left_nocontract_t;
-  typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
-  static const int NumDims = LDims + RDims - 2 * ContractDims;
-
-  typedef DSizes<Index, NumDims> Dimensions;
-
-  // typedefs needed in evalTo
-  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
-  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
-  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
-  typedef typename LeftEvaluator::Dimensions LeftDimensions;
-  typedef typename RightEvaluator::Dimensions RightDimensions;
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
-      Base(op, device) {}
-
-  // We need to redefine this method to make nvcc happy
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
-    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
-    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
-    if (data) {
-      evalTo(data);
-      return false;
-    } else {
-      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
-      evalTo(this->m_result);
-      return true;
-    }
-  }
-
-  void evalTo(Scalar* buffer) const {
-    if (this->m_lhs_inner_dim_contiguous) {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, true, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<true, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, false, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<true, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-    else {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, true, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<false, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, false, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<false, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-  }
-
-  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
-    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
-    const Index m_blocks = (m + 63) / 64;
-    const Index n_blocks = (n + 63) / 64;
-    const dim3 num_blocks(m_blocks, n_blocks, 1);
-    const dim3 block_size(8, 8, 8);
-    LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-    }
-  };
-
-  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
-    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
-      if (m < 768 || n < 768) {
-        const Index m_blocks = (m + 63) / 64;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(16, 16, 1);
-        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-      } else {
-        const Index m_blocks = (m + 127) / 128;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(8, 32, 1);
-        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-      }
-    }
-  };
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalTyped(Scalar* buffer) const {
-    // columns in left side, rows in right side
-    const Index k = this->m_k_size;
-    EIGEN_UNUSED_VARIABLE(k)
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
-                                                   LeftEvaluator, left_nocontract_t,
-                                                   contract_t, 4,
-                                                   lhs_inner_dim_contiguous,
-                                                   false, Unaligned> LhsMapper;
-
-    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
-                                                   RightEvaluator, right_nocontract_t,
-                                                   contract_t, 4,
-                                                   rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-
-    // initialize data mappers
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
-                  this->m_left_contracting_strides, this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
-                  this->m_right_contracting_strides, this->m_k_strides);
-
-    OutputMapper output(buffer, m);
-
-    setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
-    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
-  }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_USE_GPU and __CUDACC__
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#include "TensorContractionGpu.h"
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8180382729619dbed51ff8a8db86641506f9d8c
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -0,0 +1,1413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+                       const Index m_size, const Index n_size, const Index k_size) {
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  // declare and initialize 64 registers for output 8x8 block
+
+  // prefetch registers
+  Scalar lhs_pf0;
+  Scalar lhs_pf1;
+  Scalar lhs_pf2;
+  Scalar lhs_pf3;
+  Scalar lhs_pf4;
+  Scalar lhs_pf5;
+  Scalar lhs_pf6;
+  Scalar lhs_pf7;
+
+  Scalar rhs_pf0;
+  Scalar rhs_pf1;
+  Scalar rhs_pf2;
+  Scalar rhs_pf3;
+  Scalar rhs_pf4;
+  Scalar rhs_pf5;
+  Scalar rhs_pf6;
+  Scalar rhs_pf7;
+
+  // shared memory is formatted
+  // (contract idx in block, nocontract idx in block, block idx)
+  // where block idx is column major. This transposition limits the number of
+  // bank conflicts when reading the LHS. The core idea is that since the contracting
+  // index is shared by both sides, then the contracting index should be in threadIdx.x.
+
+  // On the LHS, we pad each row inside of each block with an extra element. This makes
+  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+  // conflicts on writes and also none on reads.
+
+  // storage indices
+  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+  // in the loading code, the following variables are important:
+  // threadIdx.x: the vertical position in an 8x8 block
+  // threadIdx.y: the vertical index of the 8x8 block in the grid
+  // threadIdx.z: the horizontal position in an 8x8 block
+  // k: the horizontal index of the 8x8 block in the grid
+  //
+  // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8, but now that loop is unrolled in the below code.
+
+  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+  const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k)                           \
+  {                                                             \
+    lhs_pf0 = conv(0);                                          \
+    lhs_pf1 = conv(0);                                          \
+    lhs_pf2 = conv(0);                                          \
+    lhs_pf3 = conv(0);                                          \
+    lhs_pf4 = conv(0);                                          \
+    lhs_pf5 = conv(0);                                          \
+    lhs_pf6 = conv(0);                                          \
+    lhs_pf7 = conv(0);                                          \
+                                                                \
+    rhs_pf0 = conv(0);                                          \
+    rhs_pf1 = conv(0);                                          \
+    rhs_pf2 = conv(0);                                          \
+    rhs_pf3 = conv(0);                                          \
+    rhs_pf4 = conv(0);                                          \
+    rhs_pf5 = conv(0);                                          \
+    rhs_pf6 = conv(0);                                          \
+    rhs_pf7 = conv(0);                                          \
+                                                                \
+    if (!needs_edge_check || lhs_vert < m_size) {               \
+      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
+      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
+      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
+      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
+      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
+      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
+      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
+      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
+      } else if (lhs_horiz_6 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+      } else if (lhs_horiz_5 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+      } else if (lhs_horiz_4 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+      } else if (lhs_horiz_3 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+      } else if (lhs_horiz_2 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+      } else if (lhs_horiz_1 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+      } else if (lhs_horiz_0 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+                                                                \
+    const Index rhs_vert = base_k + load_idx_vert;              \
+    if (!needs_edge_check || rhs_vert < k_size) {               \
+      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
+      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
+      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
+      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
+      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
+      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
+      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
+      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (rhs_horiz_7 < n_size) {                               \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
+      } else if (rhs_horiz_6 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+      } else if (rhs_horiz_5 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+      } else if (rhs_horiz_4 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+      } else if (rhs_horiz_3 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+      } else if (rhs_horiz_2 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+      } else if (rhs_horiz_1 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+      } else if (rhs_horiz_0 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+  }                                                             \
+
+#define writeRegToShmem(_)                      \
+  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
+  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
+                                                \
+  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
+  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
+                                                \
+  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
+  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
+                                                \
+  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
+  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
+                                                \
+  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
+  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
+                                                \
+  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
+  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
+                                                \
+  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
+  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
+                                                \
+  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
+  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
+
+  // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i)                        \
+  Scalar res(i, 0) = conv(0);                   \
+  Scalar res(i, 1) = conv(0);                   \
+  Scalar res(i, 2) = conv(0);                   \
+  Scalar res(i, 3) = conv(0);                   \
+  Scalar res(i, 4) = conv(0);                   \
+  Scalar res(i, 5) = conv(0);                   \
+  Scalar res(i, 6) = conv(0);                   \
+  Scalar res(i, 7) = conv(0);                   \
+
+  internal::scalar_cast_op<int, Scalar> conv;
+  initResultRow(0);
+  initResultRow(1);
+  initResultRow(2);
+  initResultRow(3);
+  initResultRow(4);
+  initResultRow(5);
+  initResultRow(6);
+  initResultRow(7);
+#undef initResultRow
+
+  for (Index base_k = 0; base_k < k_size; base_k += 64) {
+    // wait for previous iteration to finish with shmem. Despite common sense,
+    // the code is a bit faster with this here then at bottom of loop
+    __syncthreads();
+
+    prefetchIntoRegisters(base_k);
+    writeRegToShmem();
+
+    #undef prefetchIntoRegisters
+    #undef writeRegToShmem
+
+    // wait for shared mem packing to be done before starting computation
+    __syncthreads();
+
+    // compute 8x8 matrix product by outer product. This involves packing one column
+    // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+    Scalar lcol(0);
+    Scalar lcol(1);
+    Scalar lcol(2);
+    Scalar lcol(3);
+    Scalar lcol(4);
+    Scalar lcol(5);
+    Scalar lcol(6);
+    Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+    Scalar rrow(0);
+    Scalar rrow(1);
+    Scalar rrow(2);
+    Scalar rrow(3);
+    Scalar rrow(4);
+    Scalar rrow(5);
+    Scalar rrow(6);
+    Scalar rrow(7);
+
+    // Now x corresponds to k, y to m, and z to n
+    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j)                          \
+    lcol(0) = lhs_element(0, j);               \
+    rrow(0) = rhs_element(i, 0);               \
+    lcol(1) = lhs_element(1, j);               \
+    rrow(1) = rhs_element(i, 1);               \
+    lcol(2) = lhs_element(2, j);               \
+    rrow(2) = rhs_element(i, 2);               \
+    lcol(3) = lhs_element(3, j);               \
+    rrow(3) = rhs_element(i, 3);               \
+    lcol(4) = lhs_element(4, j);               \
+    rrow(4) = rhs_element(i, 4);               \
+    lcol(5) = lhs_element(5, j);               \
+    rrow(5) = rhs_element(i, 5);               \
+    lcol(6) = lhs_element(6, j);               \
+    rrow(6) = rhs_element(i, 6);               \
+    lcol(7) = lhs_element(7, j);               \
+    rrow(7) = rhs_element(i, 7);               \
+
+#define computeCol(j)                           \
+    res(0, j) += lcol(0) * rrow(j);             \
+    res(1, j) += lcol(1) * rrow(j);             \
+    res(2, j) += lcol(2) * rrow(j);             \
+    res(3, j) += lcol(3) * rrow(j);             \
+    res(4, j) += lcol(4) * rrow(j);             \
+    res(5, j) += lcol(5) * rrow(j);             \
+    res(6, j) += lcol(6) * rrow(j);             \
+    res(7, j) += lcol(7) * rrow(j);             \
+
+#define computePass(i)                          \
+    loadData(i, i);                             \
+                                                \
+    computeCol(0);                              \
+    computeCol(1);                              \
+    computeCol(2);                              \
+    computeCol(3);                              \
+    computeCol(4);                              \
+    computeCol(5);                              \
+    computeCol(6);                              \
+    computeCol(7);                              \
+
+    computePass(0);
+    computePass(1);
+    computePass(2);
+    computePass(3);
+    computePass(4);
+    computePass(5);
+    computePass(6);
+    computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+  } // end loop over k
+
+  // we've now iterated over all of the large (ie width 64) k blocks and
+  // accumulated results in registers. At this point thread (x, y, z) contains
+  // the sum across all big k blocks of the product of little k block of index (x, y)
+  // with block of index (y, z). To compute the final output, we need to reduce
+  // the 8 threads over y by summation.
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
+
+#define reduceRow(i, mask)                      \
+  shuffleInc(i, 0, mask);                       \
+  shuffleInc(i, 1, mask);                       \
+  shuffleInc(i, 2, mask);                       \
+  shuffleInc(i, 3, mask);                       \
+  shuffleInc(i, 4, mask);                       \
+  shuffleInc(i, 5, mask);                       \
+  shuffleInc(i, 6, mask);                       \
+  shuffleInc(i, 7, mask);                       \
+
+#define reduceMatrix(mask)                      \
+  reduceRow(0, mask);                           \
+  reduceRow(1, mask);                           \
+  reduceRow(2, mask);                           \
+  reduceRow(3, mask);                           \
+  reduceRow(4, mask);                           \
+  reduceRow(5, mask);                           \
+  reduceRow(6, mask);                           \
+  reduceRow(7, mask);                           \
+
+  // actually perform the reduction, now each thread of index (_, y, z)
+  // contains the correct values in its registers that belong in the output
+  // block
+  reduceMatrix(1);
+  reduceMatrix(2);
+  reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+  // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There's 2 ways
+  // to do this:
+  // (1) have 1 thread do 64 writes from registers into global memory
+  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+  //     each do 8 writes into global memory. We can just overwrite the shared
+  //     memory from the problem we just solved.
+  // (2) is slightly faster than (1) due to less branching and more ILP
+
+  // TODO: won't yield much gain, but could just use currently unused shared mem
+  //       and then we won't have to sync
+  // wait for shared mem to be out of use
+  __syncthreads();
+
+#define writeResultShmem(i, j)                                          \
+  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
+
+#define writeRow(i)                             \
+  writeResultShmem(i, 0);                       \
+  writeResultShmem(i, 1);                       \
+  writeResultShmem(i, 2);                       \
+  writeResultShmem(i, 3);                       \
+  writeResultShmem(i, 4);                       \
+  writeResultShmem(i, 5);                       \
+  writeResultShmem(i, 6);                       \
+  writeResultShmem(i, 7);                       \
+
+  if (threadIdx.x == 0) {
+    writeRow(0);
+    writeRow(1);
+    writeRow(2);
+    writeRow(3);
+    writeRow(4);
+    writeRow(5);
+    writeRow(6);
+    writeRow(7);
+  }
+#undef writeResultShmem
+#undef writeRow
+
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+  if (threadIdx.x < max_i_write) {
+    if (max_j_write == 8) {
+      // TODO: can i trade bank conflicts for coalesced writes?
+      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+    } else {
+#pragma unroll 7
+      for (int j = 0; j < max_j_write; j++) {
+        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+      }
+    }
+  }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(512)
+#endif
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size && base_n + 63 < n_size) {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  } else {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][16],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+
+  // prefetch registers
+  float4 lhs_pf0, rhs_pf0;
+
+  float4 results[4];
+  for (int i=0; i < 4; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+#define prefetch_lhs(reg, row, col)                            \
+    if (!CHECK_LHS_BOUNDARY) {                                 \
+      if (col < k_size) {                                      \
+        reg =lhs.template loadPacket<float4,Unaligned>(row, col);     \
+      }                                                        \
+    } else {                                                   \
+      if (col < k_size) {                                      \
+        if (row + 3 < m_size) {                                \
+          reg =lhs.template loadPacket<float4,Unaligned>(row, col);   \
+        } else if (row + 2 < m_size) {                         \
+          reg.x =lhs(row + 0, col);                            \
+          reg.y =lhs(row + 1, col);                            \
+          reg.z =lhs(row + 2, col);                            \
+        } else if (row + 1 < m_size) {                         \
+          reg.x =lhs(row + 0, col);                            \
+          reg.y =lhs(row + 1, col);                            \
+        } else if (row  < m_size) {                            \
+          reg.x =lhs(row + 0, col);                            \
+        }                                                      \
+      }                                                        \
+    }							       \
+
+  Index lhs_vert = base_m+threadIdx.x*4;
+
+  for (Index k = 0; k < k_size; k += 16) {
+
+    lhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf0 = internal::pset1<float4>(0);
+
+    Index lhs_horiz = threadIdx.y+k;
+    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+    Index rhs_vert = k+(threadIdx.x%4)*4;
+    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
+
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+      }
+    } else {
+      if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    float x1, x2 ;
+    // the following can be a bitwise operation..... some day.
+    if((threadIdx.x%8) < 4) {
+      x1 = rhs_pf0.y;
+      x2 = rhs_pf0.w;
+    } else {
+      x1 = rhs_pf0.x;
+      x2 = rhs_pf0.z;
+    }
+    #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+    x1 = __shfl_xor(x1, 4);
+    x2 = __shfl_xor(x2, 4);
+    #else
+    x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+    x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+    #endif
+    if((threadIdx.x%8) < 4) {
+      rhs_pf0.y = x1;
+      rhs_pf0.w = x2;
+    } else {
+      rhs_pf0.x = x1;
+      rhs_pf0.z = x2;
+    }
+
+    // We have 64 features.
+    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+    // ...
+    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+    // ...
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // ...
+    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
+    // ...
+
+    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+    results[0].x += fl1.x * fr1.x;\
+    results[0].y += fl1.y * fr1.x;\
+    results[0].z += fl2.x * fr1.x;\
+    results[0].w += fl2.y * fr1.x;\
+\
+    results[1].x += fl1.x * fr1.y;\
+    results[1].y += fl1.y * fr1.y;\
+    results[1].z += fl2.x * fr1.y;\
+    results[1].w += fl2.y * fr1.y;\
+\
+    results[2].x += fl1.x * fr2.x;\
+    results[2].y += fl1.y * fr2.x;\
+    results[2].z += fl2.x * fr2.x;\
+    results[2].w += fl2.y * fr2.x;\
+\
+    results[3].x += fl1.x * fr2.y;\
+    results[3].y += fl1.y * fr2.y;\
+    results[3].z += fl2.x * fr2.y;\
+    results[3].w += fl2.y * fr2.y;\
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 16; koff ++) {
+      // 32 x threads.
+      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+      int start_feature = threadIdx.y * 4;
+      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+      add_vals(fl1, fl2, fr1, fr2)
+    }
+    __syncthreads();
+  }
+
+#undef prefetch_lhs
+#undef add_vals
+
+  Index horiz_base = threadIdx.y*4+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 4; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    // CHECK LHS
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK RHS
+    /*
+    int ncols_rem = fminf(n_size- horiz_base, 4);
+    for (int i = 0; i < ncols_rem; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }*/
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+       }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][32],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+
+  // prefetch registers
+  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+  float4 rhs_pf0, rhs_pf1;
+
+  float4 results[8];
+  for (int i=0; i < 8; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
+  for (Index k = 0; k < k_size; k += 32) {
+    lhs_pf0 = internal::pset1<float4>(0);
+    lhs_pf1 = internal::pset1<float4>(0);
+    lhs_pf2 = internal::pset1<float4>(0);
+    lhs_pf3 = internal::pset1<float4>(0);
+
+    rhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf1 = internal::pset1<float4>(0);
+
+     if (!CHECK_LHS_BOUNDARY) {
+      if ((threadIdx.y/4+k+24) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+      } else if ((threadIdx.y/4+k+16) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      } else if ((threadIdx.y/4+k+8) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      } else if ((threadIdx.y/4+k) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      }
+    } else {
+      // just CHECK_LHS_BOUNDARY
+      if (lhs_vert + 3 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+          lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 2 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 1 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+        }
+      }
+    }
+    __syncthreads();
+    Index rhs_vert = k+threadIdx.x*4;
+    Index rhs_horiz0 = threadIdx.y*2+base_n;
+    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+      }
+    } else {
+      if (rhs_horiz1 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
+        } else if (rhs_vert + 2 < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+        } else if (k+threadIdx.x*4 + 1 < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        } else if (k+threadIdx.x*4  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        }
+      } else if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    __syncthreads();
+    // Loaded. Do computation
+    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+    // ..
+    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+    // ..
+    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+    // LHS.
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // ...
+    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+      results[0].x += a_feat1.x * f1.x;\
+      results[1].x += a_feat1.x * f1.y;\
+      results[2].x += a_feat1.x * f2.x;\
+      results[3].x += a_feat1.x * f2.y;\
+      results[4].x += a_feat1.x * f3.x;\
+      results[5].x += a_feat1.x * f3.y;\
+      results[6].x += a_feat1.x * f4.x;\
+      results[7].x += a_feat1.x * f4.y;\
+\
+      results[0].y += a_feat1.y * f1.x;\
+      results[1].y += a_feat1.y * f1.y;\
+      results[2].y += a_feat1.y * f2.x;\
+      results[3].y += a_feat1.y * f2.y;\
+      results[4].y += a_feat1.y * f3.x;\
+      results[5].y += a_feat1.y * f3.y;\
+      results[6].y += a_feat1.y * f4.x;\
+      results[7].y += a_feat1.y * f4.y;\
+\
+      results[0].z += a_feat2.x * f1.x;\
+      results[1].z += a_feat2.x * f1.y;\
+      results[2].z += a_feat2.x * f2.x;\
+      results[3].z += a_feat2.x * f2.y;\
+      results[4].z += a_feat2.x * f3.x;\
+      results[5].z += a_feat2.x * f3.y;\
+      results[6].z += a_feat2.x * f4.x;\
+      results[7].z += a_feat2.x * f4.y;\
+\
+      results[0].w += a_feat2.y * f1.x;\
+      results[1].w += a_feat2.y * f1.y;\
+      results[2].w += a_feat2.y * f2.x;\
+      results[3].w += a_feat2.y * f2.y;\
+      results[4].w += a_feat2.y * f3.x;\
+      results[5].w += a_feat2.y * f3.y;\
+      results[6].w += a_feat2.y * f4.x;\
+      results[7].w += a_feat2.y * f4.y;\
+
+    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 32; koff ++) {
+      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
+      int start_feature = (threadIdx.y / 4) * 8;
+
+      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
+      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+      add_vals(a3, a4, br1, br2, br3, br4)
+    }
+    __syncthreads();
+  } // end loop over k
+
+  __syncthreads();
+  Index horiz_base = (threadIdx.y/4)*8+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 8; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK BOUNDARY_B
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[64*32];
+  __shared__ float2 rhs_shmem[128*8];
+
+  typedef float2 LHS_MEM[64][32];
+  typedef float2 RHS_MEM[128][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 128 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  bool check_rhs = (base_n + 63) >= n_size;
+  bool check_lhs128 = (base_m + 127) >= m_size;
+
+  if (!check_rhs) {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[32][16];
+  __shared__ float2 rhs_shmem[64][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size) {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
+
+  typedef GpuDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+  };
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, LDims> left_dim_mapper_t;
+  typedef array<Index, RDims> right_dim_mapper_t;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // typedefs needed in evalTo
+  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device)
+  {
+    EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
+                          GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
+  }
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(this->m_result);
+      return true;
+    }
+  }
+
+  void evalTo(Scalar* buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+    else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+    const Index m_blocks = (m + 63) / 64;
+    const Index n_blocks = (n + 63) / 64;
+    const dim3 num_blocks(m_blocks, n_blocks, 1);
+    const dim3 block_size(8, 8, 8);
+    LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+    }
+  };
+
+  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+      if (m < 768 || n < 768) {
+        const Index m_blocks = (m + 63) / 64;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(16, 16, 1);
+        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      } else {
+        const Index m_blocks = (m + 127) / 128;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(8, 32, 1);
+        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      }
+    }
+  };
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    EIGEN_UNUSED_VARIABLE(k)
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, 4,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, 4,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+#if defined(EIGEN_USE_HIP)
+    setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
+#else
+    setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
+#endif
+
+    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and EIGEN_GPUCC
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index c28a10dd494f83cdd6d09983309cee2e03d2ffd1..9ab900b4a2c4e6b7651ab442bbe7e13cf7c22460 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -22,8 +22,19 @@ enum {
 /*
  * Implementation of the Eigen blas_data_mapper class for tensors.
  */
-
-template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
+/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which
+/// is scalar * for CoeffLoader.
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer>
+struct CoeffLoader;
+
+template <typename Scalar, typename Index, int side, typename Tensor,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          template <class> class MakePointer_ = MakePointer>
+class BaseTensorContractionMapper;
+
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_>
+struct CoeffLoader {
   enum {
     DirectOffsets = false
   };
@@ -34,6 +45,12 @@ template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
     eigen_assert(false && "unsupported");
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
+  data() const {
+    eigen_assert(false && "unsupported");
+    return NULL;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
 
  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -42,12 +59,19 @@ template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
     return m_tensor.template packet<LoadMode>(index);
   }
 
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_tensor.bind(cgh);
+  }
+  #endif
 
  private:
   const Tensor m_tensor;
 };
 
-template <typename Tensor> struct CoeffLoader<Tensor, true> {
+template <typename Tensor, template <class> class MakePointer_>
+struct CoeffLoader<Tensor, true, MakePointer_> {
   enum {
     DirectOffsets = true
   };
@@ -58,6 +82,11 @@ template <typename Tensor> struct CoeffLoader<Tensor, true> {
     m_data += offset;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
+  data() const {
+    return m_data;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
 
  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -65,15 +94,23 @@ template <typename Tensor> struct CoeffLoader<Tensor, true> {
   {
     return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
   }
+
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+  #endif
  private:
   typedef typename Tensor::Scalar Scalar;
-  const Scalar* m_data;
+
+  typename MakePointer_<const Scalar>::Type m_data;
 };
 
 template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
-         int packet_size, bool inner_dim_contiguous, int Alignment>
+         int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
 class SimpleTensorContractionMapper {
   public:
   EIGEN_DEVICE_FUNC
@@ -89,7 +126,7 @@ class SimpleTensorContractionMapper {
       m_k_strides(k_strides) { }
 
   enum {
-    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
+    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
   };
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
@@ -116,6 +153,7 @@ class SimpleTensorContractionMapper {
     EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
     Index nocontract_val = left ? row : col;
     Index linidx = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
       const Index idx = nocontract_val / m_ij_strides[i];
       linidx += idx * m_nocontract_strides[i];
@@ -132,6 +170,7 @@ class SimpleTensorContractionMapper {
 
     Index contract_val = left ? col : row;
     if(array_size<contract_t>::value > 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
         const Index idx = contract_val / m_k_strides[i];
         linidx += idx * m_contract_strides[i];
@@ -156,6 +195,7 @@ class SimpleTensorContractionMapper {
     Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
     Index linidx[2] = {0, 0};
     if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
         const Index idx0 = nocontract_val[0] / m_ij_strides[i];
         const Index idx1 = nocontract_val[1] / m_ij_strides[i];
@@ -176,6 +216,7 @@ class SimpleTensorContractionMapper {
 
     Index contract_val[2] = {left ? col : row, left ? col : row + distance};
     if (array_size<contract_t>::value> 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
         const Index idx0 = contract_val[0] / m_k_strides[i];
         const Index idx1 = contract_val[1] / m_k_strides[i];
@@ -207,24 +248,41 @@ class SimpleTensorContractionMapper {
     return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
   }
 
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_tensor.bind(cgh);
+  }
+  #endif
+
+  const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
+    return m_tensor;
+  }
+
+  const nocontract_t& nocontract_strides() const {
+    return m_nocontract_strides;
+  }
+  const nocontract_t& ij_strides() const { return m_ij_strides; }
+  const contract_t& contract_strides() const { return m_contract_strides; }
+  const contract_t& k_strides() const { return m_k_strides; }
+
  protected:
-  CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
+  CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
   const nocontract_t m_nocontract_strides;
   const nocontract_t m_ij_strides;
   const contract_t m_contract_strides;
   const contract_t m_k_strides;
 };
 
-
 template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size, bool inner_dim_contiguous,
-         bool inner_dim_reordered, int Alignment>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
+         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
 {
  public:
-  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
 
   EIGEN_DEVICE_FUNC
   BaseTensorContractionMapper(const Tensor& tensor,
@@ -234,12 +292,11 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
                               const contract_t& k_strides) :
   ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
 
-  typedef typename Tensor::PacketReturnType Packet;
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
-
-  template <int AlignmentType>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type
+  load(Index i, Index j) const
+  {
     // whole method makes column major assumption
 
     // don't need to add offsets for now (because operator handles that)
@@ -254,7 +311,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
 
     const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
     const Index first = indexPair.first;
-    const Index last = indexPair.second;
+    const Index lastIdx = indexPair.second;
 
     // We can always do optimized packet reads from left hand side right now, because
     // the vertical matrix dimension on the left hand side is never contracting.
@@ -262,7 +319,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
     // been shuffled first.
     if (Tensor::PacketAccess &&
         (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
-        (last - first) == (packet_size - 1)) {
+        (lastIdx - first) == (packet_size - 1)) {
 
       return this->m_tensor.template packet<AlignmentType>(first);
     }
@@ -270,31 +327,44 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
     EIGEN_ALIGN_MAX Scalar data[packet_size];
 
     data[0] = this->m_tensor.coeff(first);
+    EIGEN_UNROLL_LOOP
     for (Index k = 1; k < packet_size - 1; k += 2) {
       const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
       data[k] = this->m_tensor.coeff(internal_pair.first);
       data[k + 1] = this->m_tensor.coeff(internal_pair.second);
     }
-    data[packet_size - 1] = this->m_tensor.coeff(last);
+    data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
 
-    return pload<Packet>(data);
+    return pload<PacketT>(data);
   }
 
-  template <int AlignmentType>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
-    // whole method makes column major assumption
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type
+  load(Index i, Index j) const
+  {
+    const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
+    EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
 
-    // don't need to add offsets for now (because operator handles that)
-    const Index half_packet_size = unpacket_traits<HalfPacket>::size;
-    if (half_packet_size == packet_size) {
-      return loadPacket<AlignmentType>(i, j);
-    }
-    EIGEN_ALIGN_MAX Scalar data[half_packet_size];
-    for (Index k = 0; k < half_packet_size; k++) {
-      data[k] = operator()(i + k, j);
+    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
+    const Index first = indexPair.first;
+    const Index lastIdx = indexPair.second;
+
+    data[0] = this->m_tensor.coeff(first);
+    for (Index k = 1; k < requested_packet_size - 1; k += 2) {
+      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
     }
-    return pload<HalfPacket>(data);
+    data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload<PacketT>(data);
+  }
+
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    return this->load<PacketT,AlignmentType>(i,j);
   }
 };
 
@@ -303,11 +373,12 @@ template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          bool inner_dim_contiguous,
-         bool inner_dim_reordered, int Alignment>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
+         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
+  : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
 {
  public:
-  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
 
   EIGEN_DEVICE_FUNC
   BaseTensorContractionMapper(const Tensor& tensor,
@@ -317,16 +388,17 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con
                               const contract_t& k_strides) :
   ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
 
-  typedef typename Tensor::PacketReturnType Packet;
-  template <int> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+  template <typename PacketT,int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
     EIGEN_ALIGN_MAX Scalar data[1];
     data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
-    return pload<typename Tensor::PacketReturnType>(data);
+    return pload<PacketT>(data);
   }
-  template <int> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
-    return loadPacket(i, j);
+  template <typename PacketT,int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload<PacketT>(data);
   }
 };
 
@@ -335,14 +407,12 @@ template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
 class TensorContractionSubMapper {
  public:
-  typedef typename Tensor::PacketReturnType Packet;
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
 
-  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
-  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
   typedef Self LinearMapper;
 
   enum {
@@ -374,27 +444,32 @@ class TensorContractionSubMapper {
     return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<Alignment>(i, 0);
+      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
   }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<Alignment>(i, j);
+      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
     }
-    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
+      return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
     }
-    return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
     if (UseDirectOffsets) {
       m_base_mapper.storePacket(i, 0, p);
     }
@@ -410,19 +485,30 @@ class TensorContractionSubMapper {
 
   template <typename PacketT, int AlignmentType>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
-    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
     if (UseDirectOffsets) {
-     return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
+     return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
   }
 
-  template <typename Packet>
+  template <typename PacketT>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
     return false;
   }
 
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_base_mapper.bind(cgh);
+  }
+  #endif
+
+  const ParentMapper& base_mapper() const { return m_base_mapper; }
+  Index vert_offset() const { return m_vert_offset; }
+  Index horiz_offset() const { return m_horiz_offset; }
+
  private:
   ParentMapper m_base_mapper;
   const Index m_vert_offset;
@@ -434,14 +520,14 @@ template<typename Scalar_, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,  template <class> class MakePointer_=MakePointer>
 class TensorContractionInputMapper
-  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
+  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
 
  public:
   typedef Scalar_ Scalar;
-  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
-  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
   typedef SubMapper VectorMapper;
 
   EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
@@ -459,9 +545,29 @@ class TensorContractionInputMapper
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
     return VectorMapper(*this, i, j);
   }
+  
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
+    return Base::m_tensor;
+  }
 };
 
 
+template <typename T> struct TensorContractionInputMapperTrait;
+
+template<typename Scalar_, typename Index_, int side_,
+         typename Tensor_,
+         typename nocontract_t_, typename contract_t_,
+         int packet_size_,
+         bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_,  template <class> class MakePointer_>
+struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_, 
+                                                    nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_, 
+                                                    inner_dim_reordered_, Alignment_, MakePointer_> > {
+
+      typedef Tensor_ XprType;
+      static const bool  inner_dim_contiguous = inner_dim_contiguous_;
+      static const bool  inner_dim_reordered = inner_dim_reordered_;
+  };  
+
 
 }  // end namespace internal
 }  // end namespace Eigen
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
new file mode 100755
index 0000000000000000000000000000000000000000..473c228490f238f1a6c1bb6f0d31b6ae8f93b557
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -0,0 +1,1650 @@
+// This file is part of Eigen, a lightweight C++ template library for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
+// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorContractionSycl.h
+ *
+ * \brief:
+ *  TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+/*!
+ * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector
+ * contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor/vector
+ *
+ * \tparam StorageIndex  determines the Index type.
+ *
+ * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group
+ *
+ * \tparam CFactor: determines the number of contracting element to be process by each thread
+ *
+ * \tparam NCFactor: determines the number of non-contracting element to be process by each thread
+ */
+template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
+struct TVPanelSize {
+  // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimNC: determines the tile size for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+  // TileSizeDimC: determines the tile size for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+  // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+  // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+  // BC : determines if supporting bank conflict is required
+  static EIGEN_CONSTEXPR bool BC = false;
+};
+#endif
+
+/*!
+ * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
+ contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro).
+ *
+ * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro).
+ *
+ * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered
+ */
+
+template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
+struct TTPanelSize {
+  // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
+  // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the
+  // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro//
+#ifndef EIGEN_SYCL_REG_M
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
+#else
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
+#endif
+// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the
+// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro
+#ifndef EIGEN_SYCL_REG_N
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
+#else
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
+#endif
+  // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimM: determines the tile size for the m dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
+  // TileSizeDimN: determines the tile size for the n dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
+  // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize
+  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
+      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
+  // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize
+  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
+      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
+  // BC : determines if supporting bank conflict is required
+  static EIGEN_CONSTEXPR bool BC = true;
+  // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
+  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient  local memory)
+  static EIGEN_CONSTEXPR bool DoubleBuffer =
+#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
+      false;
+#else
+      true;
+#endif
+};
+
+/* !
+ * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
+ * specialize the contraction algorithm based on device support for dedicated local memory.
+ */
+enum class contraction_type { local, no_local };
+/* !
+ * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private).
+ */
+enum class data_source { global_mem, local_mem, private_mem };
+
+/*!
+ * \brief read, a template function used for loading the data from global
+ memory. This function is used to guarantee coalesced and vectorized load whenever possible
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+ vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex is the contracting dim index
+ *
+ * \param ld: is the leading dimension of the flattened tensor
+ */
+template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
+          typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read(
+    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
+  const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
+  const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
+  return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
+}
+
+/*!
+ * \brief read, special overload of read function, when the read access is not vectorized
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+  vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+  contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+  when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex: is the contracting dim index
+ */
+template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read(
+    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
+  const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
+  const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
+  return tensorMapper(row, col);
+}
+
+/*!
+ * \brief write, a template function used for storing the data to local memory. This function is used to guarantee
+ * coalesced and vectorized store whenever possible.
+ *
+ * \tparam StorageIndex: determines the Index type
+ *
+ * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ *
+ * \param CIndex is the contracting dim index
+ */
+
+template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type
+    write(PacketType &packet_data, DataScalar ptr) {
+  EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < PacketSize; i++) {
+    *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
+    ptr += ld;
+  }
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function
+ * is used to guarantee coalesced and vectorized store whenever possible.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+    Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+  ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+    Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+  *ptr = packet_data;
+}
+
+/*!
+ * \brief check_boundary: is used to check the edge condition for non-internal blocks.
+ *
+ * \tparam is_internal: determines if the block is internal
+ */
+template <bool is_internal>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
+  return true;
+}
+
+/*!
+ * \brief check_boundary: specialization of the check_boundary for non-internal blocks.
+ *
+ * \param cond: true when the data is in range. Otherwise false
+ */
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
+  return cond;
+}
+
+/*!
+ * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed
+ * by each workgroup.
+ *
+ * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed
+ *
+ * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be
+ * packetType; Otherwise it will be scalar Type
+ *
+ * \param elements_per_access determines the size of each element based on OutType
+ *
+ * \param is_coalesced_layout  determines whether or not the Tensor data in a memory can be access coalesced and
+ * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ *
+ * \param c_stride  determines the stride of contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ */
+template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
+struct BlockProperties {
+  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+  typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
+  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+  typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType;
+  static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
+  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+  static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+  static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+};
+
+/*!
+ * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup.  Please see
+ * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup,
+ * work-items
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
+ *
+ * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
+ * tall/skinny algorithm is used
+ *
+ * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of
+ * the flattened tensor.
+ *
+ * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the
+ * flattened tensor. It will be > 1 when tall/skinny algorithm is used.
+ *
+ * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a
+ * flattened tensor
+ *
+ * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a
+ * flattened tensor
+ *
+ * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the
+ * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used.
+ *
+ * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or
+ * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be
+ * resolve by compiler.
+ */
+template <typename StorageIndex>
+struct ThreadProperties {
+  const StorageIndex linearLocalThreadId;
+  const StorageIndex kGroupId;
+  const StorageIndex mGroupOffset;
+  const StorageIndex nGroupOffset;
+  const StorageIndex kGroupOffset;
+  const StorageIndex mLocalOffset;
+  const StorageIndex nLocalOffset;
+  const StorageIndex mGlobalOffset;
+  const StorageIndex nGlobalOffset;
+  StorageIndex kSize;
+  const bool is_internal;
+  // this is used to adjust the last block
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
+      const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
+      const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
+      const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
+      StorageIndex kSize_, const bool is_internal_)
+      : linearLocalThreadId(linearLocalThreadId_),
+        kGroupId(kGroupId_),
+        mGroupOffset(mGroupOffset_),
+        nGroupOffset(nGroupOffset_),
+        kGroupOffset(kGroupOffset_),
+        mLocalOffset(mLocalOffset_),
+        nLocalOffset(nLocalOffset_),
+        mGlobalOffset(mGlobalOffset_),
+        nGlobalOffset(nGlobalOffset_),
+        kSize(kSize_),
+        is_internal(is_internal_) {}
+};
+
+/*!
+ * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory
+ access is used to guarantee that always the memory access are coalesced.
+ *
+ * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output.
+ Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny
+ contraction is used. So in this case, a final reduction step is required to compute final output.
+
+ * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of
+ the algorithm to be used
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param groupSizeM: a logical number determining the number of work-group for m dimension
+ *
+ * \param groupSizeN: a logical number determining the number of work-group for n dimension
+ *
+ * \param numTiles: determines total number of tiles on the k dimension
+ *
+ * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
+          typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
+class TensorContractionKernel {
+ public:
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static EIGEN_CONSTEXPR int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  static EIGEN_CONSTEXPR bool is_lhs_transposed =
+      !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
+  static EIGEN_CONSTEXPR bool is_rhs_transposed =
+      !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
+
+  typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
+                          PacketReturnType>
+      LHSBlockProperties;
+
+  typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
+                          PacketReturnType>
+      RHSBlockProperties;
+
+  static EIGEN_CONSTEXPR StorageIndex NStride =
+      contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
+
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
+  typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
+  typedef
+      typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type
+          tile_ptr;
+  static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
+                                                 ? Properties::TileSizeDimM + Properties::BC
+                                                 : Properties::WorkLoadPerThreadM;
+  static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
+                                                 ? Properties::TileSizeDimN + Properties::BC
+                                                 : Properties::WorkLoadPerThreadN;
+  static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+
+  /**
+   * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not
+   * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to
+   * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting
+   * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out
+   * different type of memory needed when local/no_local memory computation is called.
+   *
+   * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation
+   of the algorithm to be used
+   * \tparam the private memory size
+   * \param ptr the tile memory pointer type
+   */
+  template <contraction_type, StorageIndex>
+  struct MemHolder {
+    tile_ptr ptr;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
+  };
+  /**
+   * \brief specialization of memHolder class when no local memory kernel is used.
+   */
+  template <StorageIndex MemSize>
+  struct MemHolder<contraction_type::no_local, MemSize> {
+    OutScalar ptr[MemSize] = {OutScalar{0}};
+  };
+  /**
+   * \brief TiledMemory: contains required memory pointer for loading  each tile of the TensorContraction panel from
+   * global memory to local/private memory when local/no_local algorithm used.
+   *
+   * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the
+   * selected contraction_type.
+   *
+   * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the
+   * selected contraction_type.
+   *
+   * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private
+   * memory is used this is set to zero as this is not applicable in case of private memory.
+   *
+   * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private
+   * memory is used this is set to zero as this is not applicable in case of private memory.
+   *
+   * \param lhs_scratch_compute : determines the  location to load for computation for lhs_local memory. This is the
+   * same as lhs_scratch_extract for private memory.
+   *
+   * \param rhs_scratch_compute : determines the  location to load for computation for rhs_local memory. This is the
+   * same as rhs_scratch_extract for private memory.
+   */
+  struct TiledMemory {
+    MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
+    MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
+    tile_ptr lhs_scratch_ptr_compute;
+    tile_ptr rhs_scratch_ptr_compute;
+    const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
+    const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
+    template <contraction_type tp = contraction_tp>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
+                typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0)
+        : lhs_scratch_extract{},
+          rhs_scratch_extract{},
+          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
+          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
+          lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
+          rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
+
+    template <contraction_type tp = contraction_tp>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
+                typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0)
+        : lhs_scratch_extract{block_start_ptr},
+          rhs_scratch_extract{lhs_scratch_extract.ptr +
+                              ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
+          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
+          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
+          lhs_extract_index(
+              local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
+          rhs_extract_index(
+              local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
+  };
+
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex groupSizeM;
+  const StorageIndex groupSizeN;
+  const StorageIndex numTiles;
+  const TripleDim triple_dim;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+                                                                const RhsMapper rhs_, OutAccessor out_res_,
+                                                                const StorageIndex groupSizeM_,
+                                                                const StorageIndex groupSizeN_,
+                                                                const StorageIndex numTiles_,
+                                                                const TripleDim triple_dim_)
+      : scratch(scratch_),
+        lhs(lhs_),
+        rhs(rhs_),
+        out_res(out_res_),
+        groupSizeM(groupSizeM_),
+        groupSizeN(groupSizeN_),
+        numTiles(numTiles_),
+        triple_dim(triple_dim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+                                                                const RhsMapper rhs_, OutAccessor out_res_,
+                                                                const StorageIndex groupSizeM_,
+                                                                const StorageIndex numTiles_,
+                                                                const TripleDim triple_dim_)
+      : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+    const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
+    const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
+    const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
+    const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
+    const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
+    const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
+    const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
+    const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
+    const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
+    const StorageIndex nLocalOffset = NStride * nLocalThreadId;
+    const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
+    const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
+
+    const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
+    StorageIndex kGroupOffset = kGroupId * kSizePerWG;
+    const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
+                             triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
+                             triple_dim.K - kGroupOffset >= kSizePerWG;
+    // this is used to adjust the last block
+    StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
+    // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to
+    // tile
+    kGroupOffset += kSize;
+
+    auto thread_properties =
+        ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
+                                       mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
+
+    auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
+
+    (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
+                                    : compute_panel<false>(itemID, thread_properties, out_ptr);
+  }
+  // The compute block computes the contraction operation private block for each thread and store the resutl in the
+  // privateRes memory of Each computation the compute block function is independent of local and no local concepts as
+  // it only compute the block on each thread's private memory space
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
+                                                                    PacketReturnType *privateRes) {
+    StorageIndex idx = 0;
+    EIGEN_CONSTEXPR StorageIndex lhs_stride =
+        contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
+      auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
+      StorageIndex lhs_index = 0;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+        PacketReturnType lhsPack{};
+        Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
+                                                                                             lhs_block_ptr + lhs_index);
+        privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
+
+        lhs_index += lhs_stride;
+        idx++;
+      }
+    }
+  }
+  // The store function write the computed contraction operation in the private memory of each thread to the global
+  // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base
+  // class.
+  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
+                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
+    auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
+      return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
+    };
+    // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is
+    // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId*
+    // WorkLoadPerThreadN slice of N
+    EIGEN_CONSTEXPR StorageIndex GlobalNStride =
+        contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
+      // output leading dimension
+      StorageIndex outputLD = 0;
+      // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local
+      // memory and extracting from local to global is the same as no transposed version. However, when local memory is
+      // not used and RHS is transposed we packetize the load for RHS.
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
+        StorageIndex globalRow = mGlobalOffset;
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+          PacketReturnType privetOut = privateRes[wLPTM];
+          if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
+            // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second
+            // StorageIndex Therefore it is always coalesced layout
+            write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
+          } else {
+            EIGEN_UNROLL_LOOP
+            for (StorageIndex mId = 0; mId < PacketSize; mId++) {
+              StorageIndex mOffset = globalRow + mId;
+              if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
+                out_ptr[mOffset + outputLD] =
+                    Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut);
+              }
+            }
+          }
+          globalRow += (PacketSize * Properties::LocalThreadSizeM);
+        }
+        outputLD += triple_dim.M;
+        privateRes += Properties::WorkLoadPerThreadM / PacketSize;
+      }
+      out_ptr += (GlobalNStride * outputLD);
+
+      nGlobalOffset += (PrivateNStride * GlobalNStride);
+    }
+  }
+  // when no local memory is used the following extract_block will be enabled
+  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
+            contraction_type contract_tp = contraction_tp>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type
+      extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
+                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
+    EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
+        InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
+    EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
+        InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
+    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+
+    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+    };
+    const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+    StorageIndex cIndex = cOffset;
+
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
+      StorageIndex ncIndex = ncOffset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
+        if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
+          auto val =
+              read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                   InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
+
+          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+                data_source::private_mem>(val, private_ptr);
+        } else {
+          EIGEN_UNROLL_LOOP
+          for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+            const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+            const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+            OutScalar val =
+                (ncInd < NC && cInd < triple_dim.K)
+                    ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                          inpt, ncInd, cInd, ld)
+                    : OutScalar(0);
+            write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+                  data_source::private_mem>(
+                val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
+                         ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
+          }
+        }
+
+        // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
+        // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread.
+        ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
+                      ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
+                      : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
+        private_ptr += InputBlockProperties::nc_stride;
+      }
+      // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC
+      private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
+      cIndex += InputBlockProperties::c_stride;
+    }
+  }
+  template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
+      const StorageIndex &linearLocalThreadId) {
+    const StorageIndex localThreadNC =
+        (InputBlockProperties::is_coalesced_layout)
+            ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
+            : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    const StorageIndex localThreadC =
+        (InputBlockProperties::is_coalesced_layout)
+            ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
+            : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
+  }
+
+  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type
+      sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
+    db_offset = !db_offset;
+  }
+
+  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type
+      sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+  }
+
+  template <contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type
+      sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
+    return;
+  }
+
+  template <bool need_sync, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type
+      sync_thread(const cl::sycl::nd_item<1> &
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+                      itemID
+#endif
+                  ) noexcept {
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+    itemID.barrier(cl::sycl::access::fence_spacce::local_space);
+#else
+    return;
+#endif
+  }
+  template <bool need_sync, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type
+      sync_thread(const cl::sycl::nd_item<1> &itemID) {
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+  }
+  template <bool need_sync>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread(
+      const cl::sycl::nd_item<1> &) {
+    return;
+  }
+
+  template <bool is_internal_block>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
+                                                                    ThreadProperties<StorageIndex> &thread_properties,
+                                                                    TiledMemory &tiled_input_block,
+                                                                    PacketReturnType *privateRes, bool &db_offset) {
+    // Tiling the Rhs block from global to local memory
+    extract_block<RHSBlockProperties, is_internal_block>(
+        rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
+        tiled_input_block.rhs_extract_index,
+        contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
+        thread_properties.kGroupOffset - thread_properties.kSize);
+
+    sync_thread<contraction_tp == contraction_type::no_local>(itemID);
+
+    // Tiling the Lhs block from global to local memory
+    extract_block<LHSBlockProperties, is_internal_block>(
+        lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
+        tiled_input_block.lhs_extract_index,
+        contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
+        thread_properties.kGroupOffset - thread_properties.kSize);
+
+    // itemID.barrier(cl::sycl::access::fence_space::local_space);
+    sync_thread<contraction_tp == contraction_type::local>(itemID);
+    // switch to compute mede
+    StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
+    StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
+    // Loop over the values of a single tile
+    for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
+      compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
+                             tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
+      lhs_offset += LSDL;
+      rhs_offset += LSDR;
+    }
+    // computing the K index for the next tile
+    thread_properties.kSize -= Properties::TileSizeDimK;
+    sync_mem(itemID, db_offset);
+  }
+
+  // when local memory is available the following compute_panel will be enabled
+  template <bool is_internal_block, typename OutPtr>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
+                                                           ThreadProperties<StorageIndex> &thread_properties,
+                                                           OutPtr out_ptr) {
+    auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
+    // Allocate register space
+    PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
+        PacketReturnType{0}};
+    bool db_offset = 0;
+
+    while (thread_properties.kSize >= Properties::TileSizeDimK) {
+      compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+    }
+    if (thread_properties.kSize > 0) {
+      compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+    }
+
+    // Storing the final results in the output
+    store<is_internal_block,
+          contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
+        out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
+        thread_properties.nGlobalOffset);
+  }
+  // When local memory is available the following extract_block will be enabled
+  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
+            contraction_type contract_tp = contraction_tp>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type
+      extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
+                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
+    EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
+        InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
+    EIGEN_CONSTEXPR StorageIndex LoadPerThread =
+        InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
+    EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
+    static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
+                   (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
+                  " LocalOffset must be divisable by stride");
+    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+    StorageIndex localThreadNC = local_index.first;
+    StorageIndex localThreadC = local_index.second;
+    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+    };
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
+      const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
+      const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
+      const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+      if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
+        auto val =
+            read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                 InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
+        write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+            val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+                     (InputBlockProperties::c_stride * localThreadC * LSD));
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+          const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+          OutScalar val =
+              (nCInd < NC && cInd < triple_dim.K)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, nCInd, cInd, ld)
+                  : OutScalar(0);
+
+          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+              val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+                       (InputBlockProperties::is_coalesced_layout ? i : 0) +
+                       ((InputBlockProperties::c_stride * localThreadC +
+                         (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
+                        LSD));
+        }
+      }
+      localThreadNC += (InputBlockProperties::is_coalesced_layout)
+                           ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
+                           : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+      localThreadC += (InputBlockProperties::is_coalesced_layout)
+                          ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
+                          : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    }
+  }
+};
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+
+/*!
+ * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special
+ * case of Tensor Tensor contraction.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
+ *
+ * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam KFactor: determines the number of elements in K dimension in a Tile
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector
+ *
+ * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output.
+ * Otherwise, the result of contraction will be written iin a temporary buffer.
+ *
+ * \param scratch: determines the local memory containing the vector block for each work-group
+ *
+ * \param vec: determines the vector input (tensor mapper)
+ *
+ * \param mat: determines the tensor input (tensor mapper)
+ *
+ * \param out_res: determines the output vector containing the contraction result
+ *
+ * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension
+ *
+ * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ * \param contractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ */
+template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
+          typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
+struct GeneralVectorTensor {
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static EIGEN_CONSTEXPR int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+
+  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
+      KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+
+  // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
+  // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true.
+  typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
+      VecBlockProperties;
+
+  Scratch scratch;
+  const VectorMapper vec;
+  const TensorMapper mat;
+  OutAccessor out_res;
+  const StorageIndex nonContractGroupSize;
+  const StorageIndex nonContractDim;
+  const StorageIndex contractDim;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
+                                                            const TensorMapper mat_, OutAccessor out_res_,
+                                                            const StorageIndex nonContractGroupSize_,
+                                                            const StorageIndex nonContractDim_,
+                                                            const StorageIndex contractDim_)
+      : scratch(scratch_),
+        vec(vec_),
+        mat(mat_),
+        out_res(out_res_),
+        nonContractGroupSize(nonContractGroupSize_),
+        nonContractDim(nonContractDim_),
+        contractDim(contractDim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto scratch_ptr = scratch.get_pointer();
+    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+    StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
+                                            : linearLocalThreadId % Properties::LocalThreadSizeNC;
+    StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
+                                         : linearLocalThreadId / Properties::LocalThreadSizeNC;
+    const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
+    const StorageIndex nonContractGroupId =
+        is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
+    const StorageIndex contractGroupId =
+        is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
+    auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
+
+    const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
+    const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
+    auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
+    const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
+    auto local_output = scratch_ptr + OutScratchOffset;
+    const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
+                             contractDim - contractGroupOffset >= Properties::TileSizeDimC;
+    is_internal
+        ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+                              scratch_ptr, contractGroupOffset,
+#endif
+                              nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+                              nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
+        : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+                               scratch_ptr, contractGroupOffset,
+#endif
+                               nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+                               nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
+  }
+  template <bool is_internal_block, typename OutPtr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
+      const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
+      OutPtr out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
+#endif
+      const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
+      StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
+      StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
+    OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
+    // Reading the vector
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+    const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
+    extract_block<VecBlockProperties, is_internal_block, KFactor,
+                  Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
+                                                                                vectorOffset, contractDim);
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+    auto in_scratch_ptr = scratch_ptr + contractId;
+#endif
+
+    StorageIndex privateOffsetC = 0;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
+      StorageIndex privateOffsetNC = 0;
+      bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      auto vecScalar = *in_scratch_ptr;
+#else
+      auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
+                           ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
+                                 is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
+                           : OutScalar(0);
+#endif
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        auto matScalar = (check_boundary<is_internal_block>(
+                             contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
+                             ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
+                                              : globalNonContractDimOffset + privateOffsetNC,
+                                   is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
+                                              : globalContractDimOffset + privateOffsetC)
+                             : OutScalar(0);
+
+        outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+        privateOffsetNC += Properties::LocalThreadSizeNC;
+      }
+      privateOffsetC += Properties::LocalThreadSizeC;
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      in_scratch_ptr += Properties::LocalThreadSizeC;
+#endif
+    }
+
+    auto out_scratch_ptr = local_output + outScratchIndex;
+    // Each block of 16*16 element in shared memory should reduce to 16*1
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      *out_scratch_ptr = outScalar[j];
+
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+    if (is_lhs_vec) {
+      nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
+      contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
+      outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    }
+
+    out_scratch_ptr = local_output + outScratchIndex;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (contractId < offset) {
+          StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
+          *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
+        }
+      }
+      // moving to next 16 by 16 block
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+
+    if (contractId == 0) {
+      out_scratch_ptr = local_output + nonContractId;
+      StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
+      out_ptr += global_final_offset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
+          auto res = *out_scratch_ptr;
+
+          *out_ptr = res;
+          out_ptr += Properties::LocalThreadSizeNC;
+        }
+        // moving to next 16 by 16 block to ge the next 16 reduced elements
+        out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+        if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
+      }
+    }
+  }
+
+  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
+            typename Local>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
+                                                                  const StorageIndex &linearLocalThreadId,
+                                                                  const StorageIndex &cOffset, const StorageIndex &C) {
+    local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
+    StorageIndex cIndex = cOffset;
+    for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
+      if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
+        auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                        InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
+                                                                                              cIndex, StorageIndex(1));
+        write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          OutScalar val =
+              (cIndex + i < C)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
+                  : OutScalar(0);
+          write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
+        }
+      }
+      local_ptr += InputBlockProperties::c_stride * GroupSize;
+      cIndex += InputBlockProperties::c_stride * GroupSize;
+    }
+  }
+};
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+
+/*!
+ * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction
+ * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param rng: determins the total input data size
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, bool Vectorizable>
+struct GeneralScalarContraction {
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex rng;
+
+  EIGEN_DEVICE_FUNC
+  GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
+                           const StorageIndex rng_)
+      : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
+
+  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
+    auto out_ptr = out_res.get_pointer();
+    auto scratch_ptr = scratch.get_pointer().get();
+
+    StorageIndex globalid = itemID.get_global_id(0);
+    StorageIndex localid = itemID.get_local_id(0);
+    OutScalar accumulator = OutScalar(0);
+    for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+    }
+    auto out_scratch_ptr = scratch_ptr + localid;
+    *out_scratch_ptr = accumulator;
+    for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
+      }
+    }
+    if (localid == 0) {
+      out_ptr[itemID.get_group(0)] = accumulator;
+    }
+  }
+};
+#endif
+
+}  // namespace internal
+}  // namespace TensorSycl
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
+                       Eigen::SyclDevice>
+    : public TensorContractionEvaluatorBase<TensorEvaluator<
+          const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
+  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
+                "SYCL tensor contraction does not support output kernels.");
+
+  typedef Eigen::SyclDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index StorageIndex;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Base::Storage Storage;
+  typedef typename Base::EvaluatorPointerType EvaluatorPointerType;
+  struct TripleDim {
+    const StorageIndex M;
+    const StorageIndex N;
+    const StorageIndex K;
+    TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
+  };
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+  };
+
+  static EIGEN_CONSTEXPR int LDims = Base::LDims;
+  static EIGEN_CONSTEXPR int RDims = Base::RDims;
+  static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims;
+
+  typedef array<StorageIndex, LDims> left_dim_mapper_t;
+  typedef array<StorageIndex, RDims> right_dim_mapper_t;
+
+  typedef array<StorageIndex, ContractDims> contract_t;
+  typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
+  typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+  typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator;
+  typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar;
+  typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
+  struct input_mapper_propertis {
+    static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
+    static EIGEN_CONSTEXPR bool is_rhs_matrix =
+        (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
+  };
+
+  TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (!data) {
+      this->m_result = this->m_device.get(
+          static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
+      data = this->m_result;
+    }
+    evalToSycl(data);
+    return (this->m_result != NULL);
+  }
+  const Eigen::SyclDevice &device() const { return this->m_device; }
+  void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    } else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(typename Base::EvaluatorPointerType buffer) const {
+    const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
+        PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
+                                                   right_nocontract_t, contract_t,
+                                                   PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
+        RhsMapper;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+    if (triple_dim.M == 1 && triple_dim.N == 1) {
+      launchSC(buffer, lhs, rhs, triple_dim.K);
+    } else
+#endif
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+        if (triple_dim.M != 1 && triple_dim.N == 1) {
+      LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
+    } else if (triple_dim.M == 1 && triple_dim.N != 1) {
+      LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
+    } else  // This is equivalent of if (m!=1 && n!=1)
+#endif
+    {
+      typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
+          inpt_mapper_properties;
+#ifndef EIGEN_SYCL_DISABLE_SKINNY
+      bool skinny = false;
+      auto platform_name = this->device().getPlatformName();
+      // This is based on empirical calculation for AMD r9-nano and Fiji
+      if (platform_name.find("AMD") == 0) {
+        skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
+                 ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
+                  (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
+      } else {
+        skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
+                  ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
+                  ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
+      }
+      if (skinny)
+        adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+      else
+#endif  // EIGEN_SYCL_DISABLE_SKINNY
+        adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+    }
+  }
+
+  template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
+  void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                                    const TripleDim &triple_dim) const {
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+    if (device().has_local_memory()) {
+      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
+      launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
+          buffer, lhs, rhs, triple_dim);
+    }
+#endif
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
+    if (!(device().has_local_memory())) {
+      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
+      launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
+          buffer, lhs, rhs, triple_dim);
+    }
+#endif
+  }
+
+  template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
+            typename Properties, typename LhsMapper, typename RhsMapper>
+  void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                const TripleDim &triple_dim) const {
+    const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
+    const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
+    const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
+    const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
+
+    const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
+    StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
+    StorageIndex groupSizeK =
+        skinny
+            ? std::max(std::min(totalTilesK,
+                                (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
+                                    (groupSizeM * groupSizeN)),
+                       StorageIndex(1))
+            : StorageIndex(1);
+
+    const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
+
+    const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
+
+    const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+    const StorageIndex globalRange = totalGroupSize * localRange;
+
+    const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
+                                         ? ((Properties::DoubleBuffer + 1) *
+                                            (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
+                                               ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
+                                                (Properties::TileSizeDimN + Properties::BC))
+                                         : StorageIndex(1);
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (groupSizeK == 1) {
+      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+                                                            PacketAccess, input_mapper_properties, true, ct>
+          ContractKernelName;
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
+    } else {
+      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+                                                            PacketAccess, input_mapper_properties, false, ct>
+          ContractKernelName;
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
+          triple_dim);
+
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      auto op = Op();
+      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+                                                               EvaluatorPointerType, Op>
+          ReductionKernel;
+
+      device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
+                                    Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
+                                cl::sycl::range<1>(localRange)),
+          StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
+
+      device().deallocate_temp(temp_pointer);
+    }
+  }
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+  template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
+  void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
+                                    StorageIndex NC, StorageIndex C) const {
+    const StorageIndex nonContractDim = NC;
+    EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
+    EIGEN_CONSTEXPR StorageIndex CFactor = 1;
+    EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
+    typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
+        Properties;
+    const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
+    const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
+    const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
+    const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
+    const StorageIndex globalRange =
+        (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
+    const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
+    const StorageIndex scratchSize =
+        (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (cNumGroups > 1) {
+      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
+                                                               is_lhs_vec, false>
+          ContractKernelName;
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
+
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+                                                               EvaluatorPointerType, Op>
+          ReductionKernel;
+
+      device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
+                                cl::sycl::range<1>(localRange)),
+          StorageIndex(1), Op(), nonContractDim, cNumGroups);
+
+      device().deallocate_temp(temp_pointer);
+    } else {
+      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
+                                                               is_lhs_vec, true>
+          ContractKernelName;
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
+    }
+  }
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+  template <typename LhsMapper, typename RhsMapper>
+  EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                                    StorageIndex K) const {
+    EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                          (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                        "The Local thread size must be a power of 2 for the reduction "
+                        "operation");
+    EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread
+    // reduces at least 512 elementss individually, we get better performance.
+    const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
+    const StorageIndex global_range = num_work_group * local_range;
+
+    typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
+        CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
+        ContractKernelName;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
+                                                                                    thread_range, local_range, K);
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, StorageIndex, local_range>
+          GenericRKernel;
+      device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
+
+      device().deallocate_temp(temp_pointer);
+    } else {
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
+                                                                                    local_range, K);
+    }
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    this->m_leftImpl.cleanup();
+    this->m_rightImpl.cleanup();
+
+    if (this->m_result) {
+      this->m_device.deallocate_temp(this->m_result);
+      this->m_result = NULL;
+    }
+  }
+  // The placeholder accessors must bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    this->m_leftImpl.bind(cgh);
+    this->m_rightImpl.bind(cgh);
+    this->m_result.bind(cgh);
+  }
+};
+}  // namespace Eigen
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index c70dea053038a43342e7a4eaab699b587e719d51..21be6ea425b4421bfa2c472e99a8e1a5c31af11a 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,57 +15,16 @@
 
 namespace Eigen {
 
-#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
-namespace internal {
-
-template<typename LhsScalar, typename LhsMapper, typename Index>
-struct packLhsArg {
-  LhsScalar* blockA;
-  const LhsMapper& lhs;
-  const Index m_start;
-  const Index k_start;
-  const Index mc;
-  const Index kc;
-};
-
-template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
-struct packRhsAndKernelArg {
-  const MaxSizeVector<LhsScalar*>* blockAs;
-  RhsScalar* blockB;
-  const RhsMapper& rhs;
-  OutputMapper& output;
-  const Index m;
-  const Index k;
-  const Index n;
-  const Index mc;
-  const Index kc;
-  const Index nc;
-  const Index num_threads;
-  const Index num_blockAs;
-  const Index max_m;
-  const Index k_block_idx;
-  const Index m_block_idx;
-  const Index n_block_idx;
-  const Index m_blocks;
-  const Index n_blocks;
-  MaxSizeVector<Notification*>* kernel_notifications;
-  const MaxSizeVector<Notification*>* lhs_notifications;
-  const bool need_to_pack;
-};
-
-}  // end namespace internal
-#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
-
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
-    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
 
   typedef ThreadPoolDevice Device;
 
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
   typedef TensorContractionEvaluatorBase<Self> Base;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -112,31 +71,35 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) {}
 
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
-            bool rhs_inner_dim_reordered, int Alignment>
+  template <int Alignment>
   void evalProduct(Scalar* buffer) const {
-    typedef internal::TensorContractionInputMapper<
-        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
-        contract_t, internal::packet_traits<LhsScalar>::size,
-        lhs_inner_dim_contiguous, false, Unaligned>
-        LhsMapper;
-    typedef internal::TensorContractionInputMapper<
-        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
-        contract_t, internal::packet_traits<RhsScalar>::size,
-        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
-        RhsMapper;
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
+    evalProductImpl<NoCallback, Alignment>(buffer, NoCallback());
+  }
+
+  template <typename EvalToCallback, int Alignment>
+  void evalProductAsync(Scalar* buffer, EvalToCallback done) const {
+    evalProductImpl<EvalToCallback, Alignment>(buffer, std::move(done));
+  }
+
+  template <typename DoneCallback, int Alignment>
+  void evalProductImpl(Scalar* buffer, DoneCallback done) const {
+    // This function computes a lot of heuristics in multiple steps, and it
+    // also has multiple exit points. To keep it sane, readable and all in one
+    // place, sync/async execution decision is made at runtime at the very end.
+    //
+    // (1) In sync mode we allocate Context on the stack, submit computations
+    //     to the device thread pool, and block on a barrier until it is
+    //     completed.
+    //
+    // (2) In async mode we allocate Context on the heap, and after all tasks
+    //     are finished, we call provided the done callback, and delete a
+    //     context from the heap.
+    //
+    // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state
+    // and temporary buffers, requried for executing the tensor contraction.
+    // They are responsible for cleaning it up after contraction is done.
+    static const bool IsEvalInSyncMode =
+        std::is_same<DoneCallback, NoCallback>::value;
 
     const Index m = this->m_i_size;
     const Index n = this->m_j_size;
@@ -172,14 +135,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // Again, we don't know number of threads yet, so we use 2.
     Index bm, bn, bk;
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, 2);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, 2);
       bm = blocking.mc();
@@ -195,35 +158,45 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         contractionCost(m, n, bm, bn, bk, shard_by_col, false);
     int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());
+    int num_threads_by_k = numThreadsInnerDim(m, n, k);
+    if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) {
+      // We are in the scenario where it is more effective to shard by the
+      // inner dimension.
+      if (IsEvalInSyncMode) {
+        EvalShardedByInnerDimContext<DoneCallback> ctx(
+            this, num_threads_by_k, buffer, m, n, k, std::move(done));
+        ctx.template run<Alignment>();
+      } else {
+        auto* ctx = new EvalShardedByInnerDimContext<DoneCallback>(
+            this, num_threads_by_k, buffer, m, n, k, std::move(done));
+        ctx->template runAsync<Alignment>();
+      }
+
+      return;
+    }
 
     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
     // model is not tuned. Remove this when the cost model is tuned.
     if (n == 1) num_threads = 1;
 
     if (num_threads == 1) {
-      // The single-threaded algorithm should be faster in this case.
-      if (n == 1)
-        this->template evalGemv<lhs_inner_dim_contiguous,
-                                rhs_inner_dim_contiguous,
-                                rhs_inner_dim_reordered, Alignment>(buffer);
-      else
-        this->template evalGemm<lhs_inner_dim_contiguous,
-                                rhs_inner_dim_contiguous,
-                                rhs_inner_dim_reordered, Alignment>(buffer);
+      TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
+                                  Unaligned, (buffer));
+      if (!IsEvalInSyncMode) done();
       return;
     }
 
     // Now that we know number of threads, recalculate sharding and blocking.
     shard_by_col = shardByCol(m, n, num_threads);
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
@@ -255,6 +228,26 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index nm = divup(nm0, gm);
     Index nn = divup(nn0, gn);
 
+    // If there is enough concurrency in the sharding dimension, we choose not
+    // to paralellize by the other dimension, and execute all kernels in sync
+    // mode. This reduces parallelism from the nm x nn down to nn
+    // (shard_by_col==true) or nm (shard_by_col==false).
+    const Index sharding_dim_tasks = shard_by_col ? nn : nm;
+    const int num_worker_threads = this->m_device.numThreadsInPool();
+
+    // With small number of threads we want to make sure that we do not reduce
+    // parallelism too much. With large number of threads we trade maximum
+    // parallelism for better memory locality.
+    const float oversharding_factor =
+        num_worker_threads <= 4  ? 8.0 :
+        num_worker_threads <= 8  ? 4.0 :
+        num_worker_threads <= 16 ? 2.0 :
+        num_worker_threads <= 32 ? 1.0 :
+        num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6;
+
+    const bool parallelize_by_sharding_dim_only =
+        sharding_dim_tasks >= oversharding_factor * num_worker_threads;
+
     // Last by not least, decide whether we want to issue both lhs and rhs
     // packing in parallel; or issue lhs packing first, and then issue rhs
     // packing when lhs packing completes (for !shard_by_col lhs and rhs are
@@ -270,40 +263,139 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // But don't do it if we will use each rhs only once. Locality seems to be
     // more important in this case.
     if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+    // Also don't get in the way of parallelize_by_sharding_dim_only
+    // optimization.
+    if (parallelize_by_sharding_dim_only) parallel_pack = false;
+
+    // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext.
+    if (IsEvalInSyncMode) {
+#define CONTEXT_ARGS                                                        \
+  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+   nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only,      \
+   NoCallback())                                                            \
+      .run()
+      TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment,
+                                  CONTEXT_ARGS);
+#undef CONTEXT_ARGS
 
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
-                  this->m_i_strides, this->m_left_contracting_strides,
-                  this->m_k_strides);
+    } else {
+#define CONTEXT_ARGS                                                        \
+  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+   nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only,      \
+   std::move(done))
+      TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback,
+                                        Alignment, CONTEXT_ARGS, run());
+#undef CONTEXT_ARGS
+    }
+  }
 
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
-                  this->m_j_strides, this->m_right_contracting_strides,
-                  this->m_k_strides);
+  // ------------------------------------------------------------------------ //
 
-    Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
-            OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
-                          k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
-                          shard_by_col, parallel_pack)
-        .run();
-  }
+  // Dummy struct to represent an empty DoneCallback.
+
+  struct NoCallback {
+    void operator()() {
+      eigen_assert(false && "NoCallback should never be called");
+    }
+  };
+
+  // ------------------------------------------------------------------------ //
 
-  // Context coordinates a single parallel gemm operation.
-  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
-            typename LhsMapper, typename RhsMapper, typename OutputMapper>
-  class Context {
+  template <typename DoneCallback, typename Context>
+  class EvalParallelNotification;
+
+  // Synchronous evaluation notification that blocks caller thread in Wait().
+  template <typename Context>
+  class EvalParallelNotification<NoCallback, Context> {
+   public:
+    EvalParallelNotification(Context*, NoCallback) {}
+    void Notify() { done_.Notify(); }
+    void Wait() { done_.Wait(); }
+   private:
+    Eigen::Notification done_;
+  };
+
+  // Asynchronous evaluation notification that does not block in Wait().
+  template <typename DoneCallback, typename Context>
+  class EvalParallelNotification {
+   public:
+    EvalParallelNotification(Context* ctx, DoneCallback done)
+        : ctx_(ctx), done_(std::move(done)) {}
+
+    void Notify() {
+      // Make a copy of done callback, because it will be destructed when we
+      // will delete context in the next line (EvalParallelNotification is a
+      // data member of EvalParallelContext class).
+      DoneCallback done_copy = std::move(done_);
+
+      // Delete parallel evaluation context.
+      delete ctx_;
+
+      // Now safely call the done callback.
+      done_copy();
+    }
+
+    void Wait() {}
+
+   private:
+    Context* ctx_;
+    DoneCallback done_;
+  };
+
+  // Context orchestrates sync/async parallel contraction evaluation. When it is
+  // executed in asynchronous mode, it owns all the shared state that might be
+  // accessible by block packing and kernel tasks.
+
+  template <typename DoneCallback, bool lhs_inner_dim_contiguous,
+            bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
+            int Alignment>
+  class EvalParallelContext {
    public:
-    Context(const Device& device, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
-            Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
-            Index gn, Index nm0, Index nn0, bool shard_by_col,
-            bool parallel_pack)
-        : device_(device),
-          lhs_(lhs),
-          rhs_(rhs),
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+        contract_t, internal::packet_traits<LhsScalar>::size,
+        lhs_inner_dim_contiguous, false, Unaligned>
+        LhsMapper;
+    typedef internal::TensorContractionInputMapper<
+        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+        contract_t, internal::packet_traits<RhsScalar>::size,
+        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+        RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
+
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+
+    EvalParallelContext(const Self* self, int num_threads, Scalar* buffer,
+                        Index tm, Index tn, Index tk, Index bm, Index bn,
+                        Index bk, Index nm, Index nn, Index nk, Index gm,
+                        Index gn, Index nm0, Index nn0, bool shard_by_col,
+                        bool parallel_pack,
+                        bool parallelize_by_sharding_dim_only,
+                        DoneCallback done)
+        : created_by_thread_id_(std::this_thread::get_id()),
+          done_(this, std::move(done)),
+          device_(self->m_device),
+          lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
+               self->m_i_strides, self->m_left_contracting_strides,
+               self->m_k_strides),
+          rhs_(self->m_rightImpl, self->m_right_nocontract_strides,
+               self->m_j_strides, self->m_right_contracting_strides,
+               self->m_k_strides),
           buffer_(buffer),
           output_(buffer, tm),
+          output_kernel_(self->m_output_kernel),
+          tensor_contraction_params_(self->m_tensor_contraction_params),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
+          parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only),
           m_(tm),
           n_(tn),
           k_(tk),
@@ -316,13 +408,29 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           gm_(gm),
           gn_(gn),
           nm0_(nm0),
-          nn0_(nn0)
-  {
+          nn0_(nn0),
+          kernel_(m_, k_, n_, bm_, bk_, bn_),
+          num_thread_local_allocations_(0),
+          // We reserve 2X more capacity for a thread local values, than the
+          // number of threads in the pool to efficiently handle task stealing
+          // by threads that are not managed by the pool.
+          thread_local_capacity(2 * (parallelize_by_sharding_dim_only_
+                                         ? device_.numThreadsInPool()
+                                         : 0)),
+          // We will use only one of the Lhs/Rhs thread local storage depending
+          // on the shard_by_col value and we parallelize by sharding dim ONLY.
+          lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity,
+                                   {*this}, {*this}),
+          rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0,
+                                   {*this}, {*this}) {
+      // These two options are mutually exclusive.
+      eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only));
+
       for (Index x = 0; x < P; x++) {
         // Normal number of notifications for k slice switch is
         // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
         // nm_ + nn_ notifications, because they will not receive notifications
-        // from preceeding kernels.
+        // from preceding kernels.
         state_switch_[x] =
             x == 0
                 ? 1
@@ -344,57 +452,97 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       }
 
       // Allocate memory for packed rhs/lhs matrices.
-      size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
-      size_t lhs_size =
-          divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
-      size_t rhs_size =
-          divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
-      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
-          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
-      char* mem = static_cast<char*>(packed_mem_);
-      for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
-        packed_lhs_[x].resize(nm0_);
-        for (Index m = 0; m < nm0_; m++) {
-          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
-          mem += lhs_size;
-        }
-        packed_rhs_[x].resize(nn0_);
-        for (Index n = 0; n < nn0_; n++) {
-          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
-          mem += rhs_size;
+      packed_mem_ = kernel_.allocateSlices(            //
+          device_,                                     //
+          /*num_lhs=*/nm0_,                            //
+          /*num_rhs=*/nn0_,                            //
+          /*num_slices=*/std::min<Index>(nk_, P - 1),  //
+          packed_lhs_, packed_rhs_);
+
+      if (parallelize_by_sharding_dim_only_) {
+        const int num_worker_threads = device_.numThreadsInPool();
+
+        if (shard_by_col) {
+          can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
+          for (int i = 0; i < nn_; ++i)
+            can_use_thread_local_packed_[i].store(true,
+                                                  std::memory_order_relaxed);
+
+          Index num_blocks = num_worker_threads * gn_;
+          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
+              device_,                                              //
+              /*num_lhs=*/0,                                        //
+              /*num_rhs=*/num_blocks,                               //
+              /*num_slices=*/1,                                     //
+              /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_);
+
+        } else {
+          can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
+          for (int i = 0; i < nm_; ++i)
+            can_use_thread_local_packed_[i].store(true,
+                                                  std::memory_order_relaxed);
+
+          Index num_blocks = num_worker_threads * gm_;
+          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
+              device_,                                              //
+              /*num_lhs=*/num_blocks,                               //
+              /*num_rhs=*/0,                                        //
+              /*num_slices=*/1, &lhs_thread_local_pre_allocated_,   //
+              /*rhs_blocks=*/nullptr);
         }
       }
     }
 
-    ~Context() {
+    ~EvalParallelContext() {
       for (Index x = 0; x < P; x++) {
         for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
         delete[] state_kernel_[x];
       }
-      internal::aligned_free(packed_mem_);
+      kernel_.deallocate(device_, packed_mem_);
+      if (parallelize_by_sharding_dim_only_) {
+        kernel_.deallocate(device_, thread_local_pre_alocated_mem_);
+        delete[] can_use_thread_local_packed_;
+      }
     }
 
     void run() {
       // Kick off packing of the first slice.
       signal_switch(0, 1);
+
       // Wait for overall completion.
-      // TODO(dvyukov): this wait can lead to deadlock.
-      // If nthreads contractions are concurrently submitted from worker
-      // threads, this wait will block all worker threads and the system will
-      // deadlock.
+      //
+      // If parallel evaluation is executed in async mode, this is a no-op, and
+      // Wait() will return immediately. In synchronous mode it will block the
+      // caller thread until it will receive notification from last task.
+      //
+      // In async mode, last task when completed will call done callback from
+      // the same thread, and will delete this context.
+      //
+      // TODO(dvyukov): This wait can lead to deadlock if contraction is
+      // evaluated in synchronous mode. If nthreads contractions are
+      // concurrently submitted from worker threads, this wait will block all
+      // worker threads and the system will deadlock.
       done_.Wait();
     }
 
    private:
-    Notification done_;
+    std::thread::id created_by_thread_id_;
+
+    // This notification is specialized on the type of DoneCallback and can be
+    // blocking or non-blocking.
+    EvalParallelNotification<DoneCallback, EvalParallelContext> done_;
+
     const Device& device_;
-    LhsMapper& lhs_;
-    RhsMapper& rhs_;
+    LhsMapper lhs_;
+    RhsMapper rhs_;
     Scalar* const buffer_;
     OutputMapper output_;
+    OutputKernelType output_kernel_;
+    TensorContractionParams tensor_contraction_params_;
     const int num_threads_;
     const bool shard_by_col_;
     const bool parallel_pack_;
+    const bool parallelize_by_sharding_dim_only_;
     // Matrix sizes.
     const Index m_;
     const Index n_;
@@ -414,6 +562,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // coarsening).
     const Index nm0_;
     const Index nn0_;
+    // Tensor contraction kernel.
+    TensorContractionKernel kernel_;
 
     // Parallelization strategy.
     //
@@ -450,9 +600,215 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // actively executing + one to track completion of kernels in the second
     // slice.
     static const Index P = 3;
-    void* packed_mem_;
-    std::vector<LhsScalar*> packed_lhs_[P - 1];
-    std::vector<RhsScalar*> packed_rhs_[P - 1];
+
+    // Handle to the allocated temporary storage for Lhs/Rhs blocks.
+    BlockMemHandle packed_mem_;
+    std::vector<LhsBlock> packed_lhs_[P - 1];
+    std::vector<RhsBlock> packed_rhs_[P - 1];
+
+    // If we choose to parallelize only by the sharding dimension, each thread
+    // will have it's own "thead local" (not a c++ thread local storage) memory
+    // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory
+    // can't be passed to a kernel that might execute on a different thread.
+    //
+    // In practice when we are ready to pack memory for the sharding dimension
+    // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice
+    // already computed (99% of the time), and we can pack data into the thread
+    // local storage, and guarantee that all the kernels will be executed
+    // immediately in the same thread. This significantly increases L1 cache hit
+    // ratio and reduces pressure on the memory bus.
+    //
+    // It's still possible that kernel for the K-th slice will be ready before
+    // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
+    // and packed_rhs_ to allow kernels to be executed later on a thread
+    // different from the thread that was used for packing.
+
+    // Handle for pre-allocated thread local memory buffers.
+    BlockMemHandle thread_local_pre_alocated_mem_;
+
+    // Only one of these will be initialized depending on shard_by_col value
+    // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
+    std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
+    std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
+
+    // How many thread local blocks were already allocated.
+    std::atomic<int> num_thread_local_allocations_;
+    const int thread_local_capacity;
+
+    // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of
+    // unique threads in a system is below or equal to the number of threads in
+    // a thread pool. We will fallback on dynamic memory allocation after that.
+
+    // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its
+    // size is equal to the grain size in Lhs/Rhs sharding dimension.
+    template <typename BlockType>
+    class ThreadLocalBlocks {
+     public:
+      ThreadLocalBlocks() = default;
+
+      ThreadLocalBlocks(BlockType* base, size_t grain_size)
+          : is_pre_allocated_(true),
+            thread_local_pre_allocated_base_(base),
+            grain_size_(grain_size) {}
+
+      ThreadLocalBlocks(BlockMemHandle mem_handle,
+                        std::vector<BlockType> blocks)
+          : is_pre_allocated_(false),
+            mem_handle_(std::move(mem_handle)),
+            blocks_(std::move(blocks)) {}
+
+      BlockType& block(int grain_index) {
+        eigen_assert(grain_index >= 0);
+        eigen_assert(static_cast<size_t>(grain_index) < size());
+        return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index]
+                                 : blocks_[grain_index];
+      }
+
+      void Release(EvalParallelContext& ctx) const {
+        if (!is_pre_allocated_) {
+          ctx.kernel_.deallocate(ctx.device_, mem_handle_);
+        }
+      }
+
+      size_t size() const {
+        return is_pre_allocated_ ? grain_size_ : blocks_.size();
+      }
+
+     private:
+      bool is_pre_allocated_;
+
+      // Reuse pre-allocated thread local buffers.
+      BlockType* thread_local_pre_allocated_base_ = nullptr;
+      size_t grain_size_ = 0;
+
+      // These will be initialized only if `is_pre_allocated == false`.
+      BlockMemHandle mem_handle_{};
+      std::vector<BlockType> blocks_;
+    };
+
+    // ThreadLocalBlocksInitialize callable does custom thread local blocks
+    // initialization, and will reuse pre-allocated buffers if possible, or will
+    // dynamically allocate new memory.
+    //
+    // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly
+    // for what side do we plan to do block allocation.
+    template <typename BlockType, bool is_rhs>
+    class ThreadLocalBlocksInitialize {
+      static constexpr bool kIsLhs =
+          !is_rhs && std::is_same<BlockType, LhsBlock>::value;
+      static const bool kIsRhs =
+          is_rhs && std::is_same<BlockType, RhsBlock>::value;
+      static_assert(kIsLhs || kIsRhs, "Unkown block type");
+
+      using Blocks = ThreadLocalBlocks<BlockType>;
+
+     public:
+      ThreadLocalBlocksInitialize(EvalParallelContext& ctx)
+          : ctx_(ctx),
+            num_worker_threads_(ctx_.device_.numThreadsInPool()) {}
+
+      void operator()(Blocks& blocks) {
+        const int n = ctx_.num_thread_local_allocations_.fetch_add(
+            1, std::memory_order_relaxed);
+
+        if (n >= num_worker_threads_) {
+          ThreadLocalBlocksAllocator<is_rhs>::allocate(ctx_, blocks);
+        } else {
+          ThreadLocalBlocksAllocator<is_rhs>::reuse(ctx_, n, blocks);
+        }
+      }
+
+     private:
+      // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to
+      // TensorContractionKernel::allocateSlices into template specializations.
+      // Also explicit specializations are not allowed at class scope in C++03,
+      // EvalCtx type parameter is just a workaround for that limitation.
+      template <bool pack_rhs, typename EvalCtx = EvalParallelContext>
+      struct ThreadLocalBlocksAllocator;
+
+      template <typename EvalCtx>
+      struct ThreadLocalBlocksAllocator</*pack_rhs=*/true, EvalCtx> {
+        static void allocate(EvalCtx& ctx, Blocks& blocks) {
+          std::vector<RhsBlock> rhs_blocks;
+          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
+              ctx.device_,
+              /*num_lhs=*/0,
+              /*num_rhs=*/ctx.gn_,
+              /*num_slices=*/1,
+              /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
+
+          blocks = ThreadLocalBlocks<RhsBlock>(std::move(mem_handle),
+                                               std::move(rhs_blocks));
+        }
+
+        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+          RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index];
+          blocks = ThreadLocalBlocks<RhsBlock>(ptr, ctx.gn_);
+        }
+      };
+
+      template <typename EvalCtx>
+      struct ThreadLocalBlocksAllocator</*pack_rhs=*/false, EvalCtx> {
+        static void allocate(EvalCtx& ctx, Blocks& blocks) {
+          std::vector<LhsBlock> lhs_blocks;
+          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
+              ctx.device_,
+              /*num_lhs=*/ctx.gm_,
+              /*num_rhs=*/0,
+              /*num_slices=*/1,
+              /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
+
+          blocks = ThreadLocalBlocks<LhsBlock>(std::move(mem_handle),
+                                               std::move(lhs_blocks));
+        }
+
+        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+          LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index];
+          blocks = ThreadLocalBlocks<LhsBlock>(ptr, ctx.gm_);
+        }
+      };
+
+      EvalParallelContext& ctx_;
+      const int num_worker_threads_;
+    };
+
+    template <typename BlockType>
+    class ThreadLocalBlocksRelease {
+     public:
+      using Blocks = ThreadLocalBlocks<BlockType>;
+      ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {}
+      void operator()(Blocks& blocks) { blocks.Release(ctx_); }
+
+     private:
+      EvalParallelContext& ctx_;
+    };
+
+    // ThreadLocalBlocks initialization callables.
+    using ThreadLocalLhsInit =
+        ThreadLocalBlocksInitialize<LhsBlock, /*is_rhs=*/false>;
+    using ThreadLocalRhsInit =
+        ThreadLocalBlocksInitialize<RhsBlock, /*is_rhs=*/true>;
+
+    // ThreadLocalBlocks release callables.
+    using ThreadLocalLhsRelease = ThreadLocalBlocksRelease<LhsBlock>;
+    using ThreadLocalRhsRelease = ThreadLocalBlocksRelease<RhsBlock>;
+
+    // Thread local containers for Lhs/Rhs block packs. In practice only one of
+    // them will be used, depending on the shard_by_col value.
+    Eigen::ThreadLocal<ThreadLocalBlocks<LhsBlock>, ThreadLocalLhsInit,
+                       ThreadLocalLhsRelease>
+        lhs_thread_local_blocks_;
+    Eigen::ThreadLocal<ThreadLocalBlocks<RhsBlock>, ThreadLocalRhsInit,
+                       ThreadLocalRhsRelease>
+        rhs_thread_local_blocks_;
+
+    // After a particular shard for Kth slice missed thread local execution
+    // opportunity (K-1 slice didn't complete kernels execution), we can no
+    // longer schedule K+1 and following slices in thread local mode, because
+    // there is no more guarantee that previous kernels were executed
+    // sequentially in the same thread (size is nn_ or nm_).
+    std::atomic<bool>* can_use_thread_local_packed_;
+
     std::atomic<uint8_t>** state_kernel_[P];
     // state_switch_ is frequently modified by worker threads, while other
     // fields are read-only after constructor. Let's move it to a separate cache
@@ -461,69 +817,168 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     std::atomic<Index> state_packing_ready_[P];
     std::atomic<Index> state_switch_[P];
 
+    LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
+      if (use_thread_local) {
+        eigen_assert(!shard_by_col_);
+        ThreadLocalBlocks<LhsBlock>& blocks = lhs_thread_local_blocks_.local();
+
+        Index grain_index = m1 - m * gm_;
+        return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
+      } else {
+        return packed_lhs_[k % (P - 1)][m1];
+      }
+    }
+
+    RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) {
+      if (use_thread_local) {
+        eigen_assert(shard_by_col_);
+        ThreadLocalBlocks<RhsBlock>& blocks = rhs_thread_local_blocks_.local();
+
+        Index grain_index = n1 - n * gn_;
+        return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
+      } else {
+        return packed_rhs_[k % (P - 1)][n1];
+      }
+    }
+
+    // In following two methods (pack_lhs and pack_rhs), if we know for sure
+    // that we'll be able to immediately call a kernel with packed data, and do
+    // not submit it to the thread pool, we can use thread local memory for
+    // packed data.
+    //
+    // We can only reliably check it if we are running all kernels in sync mode
+    // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to
+    // run, it's guaranteed that all kernels with larger values of m (n) are
+    // also ready, because we execute them in the same order for all K slices.
+
     void pack_lhs(Index m, Index k) {
+      bool use_thread_local = false;
+
+      if (parallelize_by_sharding_dim_only_ && !shard_by_col_ &&
+          can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) {
+        if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) {
+          use_thread_local = true;
+        } else {
+          // If we can't guarantee that all kernels in `k` slice will be
+          // executed sequentially in current thread, it's no longer safe to use
+          // thread local memory in following slices along the k dimensions.
+          eigen_assert(k > 0);
+          can_use_thread_local_packed_[m].store(false,
+                                                std::memory_order_relaxed);
+        }
+      }
+
       const Index mend = m * gm_ + gm(m);
       for (Index m1 = m * gm_; m1 < mend; m1++)
-        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
-                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+        kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local),
+                        lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
 
       if (!parallel_pack_ && shard_by_col_) {
+        assert(!use_thread_local);
         signal_packing(k);
       } else {
         signal_switch(k + 1);
-        for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
+        for (Index n = nn_ - 1; n >= 0; n--) {
+          bool sync = parallelize_by_sharding_dim_only_ || n == 0;
+          signal_kernel(m, n, k, sync, use_thread_local);
+        }
       }
     }
 
     void pack_rhs(Index n, Index k) {
+      bool use_thread_local = false;
+
+      if (parallelize_by_sharding_dim_only_ && shard_by_col_ &&
+          can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) {
+        if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) {
+          use_thread_local = true;
+        } else {
+          // If we can't guarantee that all kernels in `k` slice will be
+          // executed sequentially in current thread, it's no longer safe to use
+          // thread local memory in followig slices along the k dimensions.
+          eigen_assert(k > 0);
+          can_use_thread_local_packed_[n].store(false,
+                                                std::memory_order_relaxed);
+        }
+      }
+
       const Index nend = n * gn_ + gn(n);
       for (Index n1 = n * gn_; n1 < nend; n1++) {
-        if (k == 0) {
-          // Zero the output memory in parallel.
-          // On 10000x2x10000 mm zeroing can easily take half of time.
-          // Zero (bn x m) row. Safe to do here because all kernels that will
-          // write to this memory depend on completion of this task.
-          // Note: don't call device_.memset() here. device_.memset() blocks on
-          // thread pool worker thread, which can lead to underutilization and
-          // deadlocks.
+        if (!TensorContractionKernel::HasBeta && k == 0) {
+          // Zero the output memory in parallel, only if contraction kernel does
+          // not support `beta`. Otherwise we will pass beta 0.0 to the first
+          // call to the `TensorContractionKernel::invoke()`.
+          //
+          // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
+          // x m) row. Safe to do here because all kernels that will write to
+          // this memory depend on completion of this task. Note: don't call
+          // device_.memset() here. device_.memset() blocks on thread pool
+          // worker thread, which can lead to underutilization and deadlocks.
           memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
         }
-        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
-                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+        kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
+                        rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
       }
 
       if (parallel_pack_ || shard_by_col_) {
         signal_switch(k + 1);
-        for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
+        for (Index m = nm_ - 1; m >= 0; m--) {
+          bool sync = parallelize_by_sharding_dim_only_ || m == 0;
+          signal_kernel(m, n, k, sync, use_thread_local);
+        }
       } else {
+        assert(!use_thread_local);
         signal_packing(k);
       }
     }
 
-    void kernel(Index m, Index n, Index k) {
+    void kernel(Index m, Index n, Index k, bool use_thread_local) {
       // Note: order of iteration matters here. Iteration over m is innermost
-      // because we want to reuse the same packed rhs in consequetive tasks
+      // because we want to reuse the same packed rhs in consecutive tasks
       // (rhs fits into L2$ while lhs only into L3$).
       const Index nend = n * gn_ + gn(n);
       const Index mend = m * gm_ + gm(m);
+
+      // NOTE: output = alpha * LHS * RHS + beta * output.
+      const Scalar alpha = Scalar(1);
+      const Scalar beta =
+          (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1);
+
       if (shard_by_col_) {
         for (Index n1 = n * gn_; n1 < nend; n1++) {
-          for (Index m1 = m * gm_; m1 < mend; m1++)
-            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
-                         packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+          for (Index m1 = m * gm_; m1 < mend; m1++) {
+            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+            kernel_.invoke(
+                output_mapper,
+                packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+                packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+                bk(k), bn(n1), alpha, beta);
+
+            // We are done with the last task for the [m1, n1] block.
+            if (k + 1 == nk_) {
+              output_kernel_(output_mapper, tensor_contraction_params_,
+                             m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+            }
+          }
         }
       } else {
         for (Index m1 = m * gm_; m1 < mend; m1++)
           for (Index n1 = n * gn_; n1 < nend; n1++) {
-            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
-                         packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+            kernel_.invoke(
+                output_mapper,
+                packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+                packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+                bk(k), bn(n1), alpha, beta);
+
+            // We are done with the last task for the [m1, n1] block.
+            if (k + 1 == nk_) {
+              output_kernel_(output_mapper, tensor_contraction_params_,
+                             m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+            }
           }
       }
-      signal_kernel(m, n, k + 1, false);
+      signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false);
       signal_switch(k + 2);
     }
 
@@ -536,16 +991,23 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       enqueue_packing(k, shard_by_col_);
     }
 
-    void signal_kernel(Index m, Index n, Index k, bool sync) {
+    void signal_kernel(Index m, Index n, Index k, bool sync,
+                       bool use_thread_local) {
       std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
       Index s = state->load();
       eigen_assert(s > 0);
-      if (s != 1 && state->fetch_sub(1) != 1) return;
+      if (s != 1 && state->fetch_sub(1) != 1) {
+        eigen_assert(!use_thread_local);
+        return;
+      }
       state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
-      if (sync)
-        kernel(m, n, k);
-      else
-        device_.enqueueNoNotification([=]() { kernel(m, n, k); });
+      if (sync) {
+        kernel(m, n, k, use_thread_local);
+      } else {
+        eigen_assert(!use_thread_local);
+        device_.enqueueNoNotification(
+            [=]() { kernel(m, n, k, use_thread_local); });
+      }
     }
 
     void signal_switch(Index k, Index v = 1) {
@@ -595,11 +1057,32 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         else
           pack_lhs(start, k);
       } else {
-        Index mid = (start + end) / 2;
-        device_.enqueueNoNotification(
-            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
-        device_.enqueueNoNotification(
-            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
+        while (end - start > 1) {
+          Index mid = (start + end) / 2;
+          device_.enqueueNoNotification(
+              [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+          end = mid;
+        }
+
+        // Decide if we want to run first packing task (start == 0) in
+        // async mode if we parallelize only by sharding dim:
+        // (1) pack_lhs and pack_rhs call signal_switch before completing
+        //     all calls to signal_kernel, which in sync mode might lead
+        //     to the execution of the first kernel of the k+1 slice, before
+        //     completing a call to the last kernel of the k slice.
+        // (2) all pack tasks for sharded dim must be executed in a thread
+        //     pool to get pre-allocated thead local buffers.
+        bool pack_async =
+          (start == 0) &&
+          (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) &&
+          (k > 0 || std::this_thread::get_id() == created_by_thread_id_);
+
+        if (pack_async) {
+          device_.enqueueNoNotification(
+              [=]() { enqueue_packing_helper(start, end, k, rhs); });
+        } else {
+          enqueue_packing_helper(start, end, k, rhs);
+        }
       }
     }
 
@@ -611,10 +1094,364 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
     Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
 
-    Context(const Context&) = delete;
-    void operator=(const Context&) = delete;
+    EvalParallelContext(const EvalParallelContext&) = delete;
+    void operator=(const EvalParallelContext&) = delete;
+  };
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  using SyncEvalParallelContext =
+      EvalParallelContext<NoCallback, lhs_inner_dim_contiguous,
+                          rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                          Alignment>;
+
+  // ------------------------------------------------------------------------ //
+
+  // EvalShardedByInnerDimContext orchestrates sync/async contraction
+  // evaluation, when we shard by inner dimension. When it is executed in
+  // asynchronous mode, it owns all the shared state that might be accessible by
+  // block processing tasks.
+
+  template <typename DoneCallback>
+  struct EvalShardedByInnerDimContext {
+    EvalShardedByInnerDimContext(const Self* self, int num_threads,
+                                 Scalar* result_buffer,
+                                 Index m_size, Index n_size, Index k_size,
+                                 DoneCallback done_callback)
+        : evaluator(self),
+          m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
+          m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
+          m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
+          result(result_buffer),
+          m(m_size),
+          n(n_size),
+          k(k_size),
+          done(std::move(done_callback)),
+          buffer_size_bytes(m * n * sizeof(Scalar)),
+          block_size(blockSize(k, num_threads)),
+          num_blocks(divup<Index>(k, block_size)),
+          num_pending_blocks(internal::convert_index<int>(num_blocks)),
+          l0_ranges(divup<Index>(num_blocks, l0_size)),
+          l0_state(l0_ranges),
+          block_buffers(num_blocks) {
+      // Keep count of pending gemm tasks for each l0 range.
+      for (int i = 0; i < l0_ranges; ++i) {
+        const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
+        l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
+      }
+
+      // Allocate temporary buffers for each block.
+      for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        Scalar* buf = block_idx == 0
+                          ? result
+                          : static_cast<Scalar*>(evaluator->m_device.allocate(
+                                buffer_size_bytes));
+        block_buffers.emplace_back(buf);
+      }
+    }
+
+    ~EvalShardedByInnerDimContext() {
+      for (Index i = 1; i < num_blocks; ++i) {
+        evaluator->m_device.deallocate(block_buffers[i]);
+      }
+    }
+
+    template <int Alignment>
+    void run() {
+      Barrier barrier(internal::convert_index<int>(num_blocks));
+      eval<Alignment>(barrier, 0, num_blocks);
+      barrier.Wait();
+
+      // Aggregate partial sums from l0 ranges.
+      aggregateL0Blocks<Alignment>();
+
+      // Apply output kernel.
+      applyOutputKernel();
+    }
+
+    template <int Alignment>
+    void runAsync() {
+      evalAsync<Alignment>(0, num_blocks);
+    }
+
+   private:
+    // The underlying GEMM kernel assumes that k is a multiple of
+    // the packet size and subtle breakage occurs if this is violated.
+    static const Index packet_size = internal::packet_traits<RhsScalar>::size;
+
+    const Self* evaluator;  // TensorContraction evaluator
+
+    // These fields required fromTENSOR_CONTRACTION_DISPATCH macro.
+    bool m_lhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_reordered;
+
+    Scalar* result;
+
+    Index m;
+    Index n;
+    Index k;
+
+    DoneCallback done;
+
+    // ----------------------------------------------------------------------//
+    // Algorithm parameters.
+
+    // We will compute partial results into the buffers of this size.
+    Index buffer_size_bytes;
+
+    Index block_size;
+    Index num_blocks;
+
+    // Keep track of pending tasks when evaluate in async mode.
+    std::atomic<int> num_pending_blocks;
+
+    // We compute partial gemm results in parallel, and to get the final result
+    // we need to add them all together. For the large number of threads (>= 48)
+    // this adds a very expensive sequential step at the end.
+    //
+    // We split the [0, num_blocks) into small ranges, and when a task for the
+    // block finishes its partial gemm computation, it checks if it was the last
+    // gemm in the range, and if so, it will add all blocks of the range.
+    //
+    // After all tasks done, we need to add only these pre-aggregated blocks.
+
+    // For now we use just a single level of ranges to compute pre-aggregated
+    // partial sums, but in general we can use more layers to compute tree
+    // aggregation in parallel and reduce the size of the sequential step.
+    //
+    // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
+    // sense only if number of threads >= ~128?
+    static const Index l0_size = 4;
+    Index l0_ranges;
+
+    // Keep count of pending gemm tasks for each l0 range.
+    MaxSizeVector<std::atomic<int>> l0_state;  // [0, l0_ranges)
+
+    // Buffers allocated for each temporary block computation.
+    MaxSizeVector<Scalar*> block_buffers;  // [0, num_blocks)
+
+    template <int Alignment>
+    void processBlock(Index block_idx, Index begin, Index end) {
+      Scalar* buf = block_buffers[block_idx];
+
+      TENSOR_CONTRACTION_DISPATCH(
+          evaluator->template evalGemmPartialWithoutOutputKernel, Alignment,
+          (buf, begin, end,
+           /*num_threads=*/internal::convert_index<int>(num_blocks)));
+
+      // Check if it was the last task in l0 range.
+      const Index l0_index = block_idx / l0_size;
+      const int v = l0_state[l0_index].fetch_sub(1);
+      eigen_assert(v >= 1);
+
+      // If we processed the last block of the range, we can aggregate all
+      // partial results into the first block of the range.
+      if (v == 1) {
+        const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index);
+        const Index dst_block_idx = l0_index * l0_size;
+
+        if (rng_size == l0_size) {
+          addAllToBuffer<Alignment>(
+              m * n,
+              /*src_buf0=*/block_buffers[dst_block_idx + 1],
+              /*src_buf1=*/block_buffers[dst_block_idx + 2],
+              /*src_buf2=*/block_buffers[dst_block_idx + 3],
+              /*dst_buf= */ block_buffers[dst_block_idx]);
+        } else {
+          // Aggregate blocks of potentially incomplete last range.
+          for (int i = 1; i < rng_size; ++i) {
+            addToBuffer<Alignment>(m * n,
+                                   /*src_buf=*/block_buffers[dst_block_idx + i],
+                                   /*dst_buf=*/block_buffers[dst_block_idx]);
+          }
+        }
+      }
+    }
+
+    // Aggregate partial sums from l0 ranges.
+    template <int Alignment>
+    void aggregateL0Blocks() const {
+      Index l0_index = 1;
+
+      for (; l0_index + 2 < l0_ranges; l0_index += 3) {
+        addAllToBuffer<Alignment>(
+            m * n,
+            /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
+            /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
+            /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
+            /*dst_buf= */ block_buffers[0]);
+      }
+
+      for (; l0_index < l0_ranges; ++l0_index) {
+        addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size],
+                               block_buffers[0]);
+      }
+    }
+
+    void applyOutputKernel() const {
+      typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+      evaluator->m_output_kernel(
+          OutputMapper(result, m), evaluator->m_tensor_contraction_params,
+          static_cast<Eigen::Index>(0), static_cast<Eigen::Index>(0), m, n);
+    }
+
+    // Compute block size with accounting for potentially incomplete last block.
+    Index actualBlockSize(Index block_idx) const {
+      return block_idx + 1 < num_blocks
+                 ? block_size
+                 : k + block_size - block_size * num_blocks;
+    };
+
+    // Compute range size with accounting for potentially incomplete last range.
+    Index actualRangeSize(Index num_ranges, Index range_size,
+                          Index range_idx) const {
+      eigen_assert(range_idx < num_ranges);
+      return range_idx + 1 < num_ranges
+                 ? range_size
+                 : num_blocks + range_size - range_size * num_ranges;
+    };
+
+    template <int Alignment>
+    EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf,
+                                                Scalar* tgt_buf) {
+      const int output_packet_size =
+          internal::unpacket_traits<PacketReturnType>::size;
+      size_t i = 0;
+      const size_t num_packets = n / output_packet_size;
+      for (; i < output_packet_size * num_packets; i += output_packet_size) {
+        const PacketReturnType src_val =
+            internal::pload<PacketReturnType>(src_buf + i);
+        const PacketReturnType tgt_val =
+            internal::ploadt<PacketReturnType, Alignment>(tgt_buf + i);
+        const PacketReturnType sum = internal::padd(src_val, tgt_val);
+        internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i,
+                                                               sum);
+      }
+      for (; i < n; ++i) {
+        tgt_buf[i] += src_buf[i];
+      }
+    }
+
+    template <int Alignment>
+    EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n,
+                                                   const Scalar* src_buf0,
+                                                   const Scalar* src_buf1,
+                                                   const Scalar* src_buf2,
+                                                   Scalar* dst_buf) {
+      using ::Eigen::internal::padd;
+      using ::Eigen::internal::pload;
+      using ::Eigen::internal::ploadt;
+      using ::Eigen::internal::pstoret;
+
+      const int output_packet_size =
+          internal::unpacket_traits<PacketReturnType>::size;
+
+      size_t i = 0;
+      const size_t num_packets = n / output_packet_size;
+      for (; i < output_packet_size * num_packets; i += output_packet_size) {
+        const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
+        const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
+        const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
+
+        const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
+        const auto sum =
+            padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
+
+        pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
+      }
+      for (; i < n; ++i) {
+        dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
+      }
+    }
+
+    template <int Alignment>
+    void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
+      while (end_block_idx - start_block_idx > 1) {
+        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+        evaluator->m_device.enqueueNoNotification(
+            [this, &barrier, mid_block_idx, end_block_idx]() {
+              eval<Alignment>(barrier, mid_block_idx, end_block_idx);
+            });
+        end_block_idx = mid_block_idx;
+      }
+
+      Index block_idx = start_block_idx;
+      Index block_start = block_idx * block_size;
+      Index block_end = block_start + actualBlockSize(block_idx);
+
+      processBlock<Alignment>(block_idx, block_start, block_end);
+      barrier.Notify();
+    }
+
+    template <int Alignment>
+    void evalAsync(Index start_block_idx, Index end_block_idx) {
+      while (end_block_idx - start_block_idx > 1) {
+        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+        evaluator->m_device.enqueueNoNotification(
+            [this, mid_block_idx, end_block_idx]() {
+              evalAsync<Alignment>(mid_block_idx, end_block_idx);
+            });
+        end_block_idx = mid_block_idx;
+      }
+
+      Index block_idx = start_block_idx;
+
+      Index block_start = block_idx * block_size;
+      Index block_end = block_start + actualBlockSize(block_idx);
+
+      processBlock<Alignment>(block_idx, block_start, block_end);
+
+      int v = num_pending_blocks.fetch_sub(1);
+      eigen_assert(v >= 1);
+
+      if (v == 1) {
+        // Aggregate partial sums from l0 ranges.
+        aggregateL0Blocks<Alignment>();
+
+        // Apply output kernel.
+        applyOutputKernel();
+
+        // NOTE: If we call `done` callback before deleting this (context),
+        // it might deallocate Self* pointer captured by context, and we'll
+        // fail in destructor trying to deallocate temporary buffers.
+
+        // Move done call back from context before it will be destructed.
+        DoneCallback done_copy = std::move(done);
+
+        // We are confident that we are the last one who touches context.
+        delete this;
+
+        // Now safely call the done callback.
+        done_copy();
+      }
+    }
+
+    // Cost model doesn't capture well the cost associated with constructing
+    // tensor contraction mappers and computing loop bounds in gemm_pack_lhs
+    // and gemm_pack_rhs, so we specify minimum desired block size.
+    static Index blockSize(Index k, int num_threads) {
+      const auto round_up = [=](Index index) -> Index {
+        const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+        return divup<Index>(index, kmultiple) * kmultiple;
+      };
+
+      const Index target_block_size = round_up(divup<Index>(k, num_threads));
+      const Index desired_min_block_size = 12 * packet_size;
+
+      return numext::mini<Index>(
+          k, numext::maxi<Index>(desired_min_block_size, target_block_size));
+    }
+
+    EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
+    void operator=(const EvalShardedByInnerDimContext&) = delete;
   };
 
+  // ------------------------------------------------------------------------ //
+
+  // Below are the function used by evalProductImpl heuristics, trying to select
+  // optimcal parameters for parallelization algorithm.
+
   // Decide whether we want to shard m x n contraction by columns or by rows.
   static bool shardByCol(Index m, Index n, Index num_threads) {
     // Note: we are comparing both n and m against Traits::nr, it is not
@@ -718,304 +1555,15 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     return 0;
   }
 
-#else  // EIGEN_USE_SIMPLE_THREAD_POOL
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalProduct(Scalar* buffer) const {
-    if (this->m_j_size == 1) {
-      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-      return;
-    }
-
-    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-  }
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalGemm(Scalar* buffer) const {
-    // columns in left side, rows in right side
-    const Index k = this->m_k_size;
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-
-    const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
-    const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
-
-    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
-                                                   LeftEvaluator, left_nocontract_t,
-                                                   contract_t, lhs_packet_size,
-                                                   lhs_inner_dim_contiguous,
-                                                   false, Unaligned> LhsMapper;
-
-    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
-                                                   RightEvaluator, right_nocontract_t,
-                                                   contract_t, rhs_packet_size,
-                                                   rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-    // TODO: packing could be faster sometimes if we supported row major tensor mappers
-    typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor> LhsPacker;
-    typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
-
-    // TODO: replace false, false with conjugate values?
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false> GebpKernel;
-
-    typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
-    typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
-
-    // initialize data mappers
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
-                  this->m_left_contracting_strides, this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
-                  this->m_right_contracting_strides, this->m_k_strides);
-
-    OutputMapper output(buffer, m);
-
-    // compute block sizes (which depend on number of threads)
-    const Index num_threads = this->m_device.numThreads();
-    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
-    Index mc = blocking.mc();
-    Index nc = blocking.nc();
-    Index kc = blocking.kc();
-    eigen_assert(mc <= m);
-    eigen_assert(nc <= n);
-    eigen_assert(kc <= k);
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-    const Index k_blocks = CEIL_DIV(k, kc);
-    const Index n_blocks = CEIL_DIV(n, nc);
-    const Index m_blocks = CEIL_DIV(m, mc);
-    const Index sizeA = mc * kc;
-    const Index sizeB = kc * nc;
-
-    /*    cout << "m: " << m << " n: " << n << " k: " << k << endl;
-    cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
-    cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
-    cout << "num threads: " << num_threads << endl;
-    */
-
-    // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
-    //       aren't 16 byte aligned segfaults will happen due to SIMD instructions
-    // note: You can get away with allocating just a single blockA and offsets and meet the
-    //       the alignment requirements with the assumption that
-    //       (Traits::mr * sizeof(ResScalar)) % 16 == 0
-    const Index numBlockAs = numext::mini(num_threads, m_blocks);
-    MaxSizeVector<LhsScalar *> blockAs(num_threads);
-    for (int i = 0; i < num_threads; i++) {
-      blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
-    }
-
-    // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
-    // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
-    //       Other options: (1) reuse memory when a thread finishes. con: tricky
-    //                      (2) allocate block B memory in each thread. con: overhead
-    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
-    for (int i = 0; i < n_blocks; i++) {
-      blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
-    }
-
-    // lhs_notifications starts with all null Notifications
-    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
-
-    // this should really be numBlockAs * n_blocks;
-    const Index num_kernel_notifications = num_threads * n_blocks;
-    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
-                                                    nullptr);
-
-    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
-      const Index k_start = k_block_idx * kc;
-      // make sure we don't overshoot right edge of left matrix
-      const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
-
-      for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
-        const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
-
-        for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
-          const Index m_start = mt_block_idx * mc;
-          const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
-          eigen_assert(actual_mc > 0);
-
-          Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
-
-          for (int i = 0; i < n_blocks; ++i) {
-            Index notification_id = (blockAId * n_blocks + i);
-            // Wait for any current kernels using this slot to complete
-            // before using it.
-            if (kernel_notifications[notification_id]) {
-              wait_until_ready(kernel_notifications[notification_id]);
-              delete kernel_notifications[notification_id];
-            }
-            kernel_notifications[notification_id] = new Notification();
-          }
-          const packLArg arg = {
-            blockAs[blockAId], // blockA
-            lhs,        // lhs
-            m_start,    // m
-            k_start,    // k
-            actual_mc,  // mc
-            actual_kc,  // kc
-          };
-
-          // Delete any existing notification since we may be
-          // replacing it.  The algorithm should ensure that there are
-          // no existing waiters on this notification.
-          delete lhs_notifications[blockAId];
-          lhs_notifications[blockAId] =
-          this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
-        }
-
-        // now start kernels.
-        const Index m_base_start = m_block_idx * mc;
-        const bool need_to_pack = m_block_idx == 0;
-
-        for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
-          const Index n_start = n_block_idx * nc;
-          const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
-
-          // first make sure the previous kernels are all done before overwriting rhs. Also wait if
-          // we're going to start new k. In both cases need_to_pack is true.
-          if (need_to_pack) {
-            for (Index i = num_blocks; i < num_threads; ++i) {
-              Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
-              Index future_id = (blockAId * n_blocks + n_block_idx);
-              wait_until_ready(kernel_notifications[future_id]);
-            }
-          }
-
-          packRKArg arg = {
-            &blockAs, // blockA
-            blockBs[n_block_idx], // blockB
-            rhs,          // rhs
-            output,       // output
-            m_base_start, // m
-            k_start,      // k
-            n_start,      // n
-            mc,           // mc
-            actual_kc,    // kc
-            actual_nc,    // nc
-            num_threads,
-            numBlockAs,
-            m,
-            k_block_idx,
-            m_block_idx,
-            n_block_idx, // n_block_idx
-            m_blocks, // m_blocks
-            n_blocks, // n_blocks
-            &kernel_notifications, // kernel notifications
-            &lhs_notifications,    // lhs notifications
-            need_to_pack, // need_to_pack
-          };
-
-          // We asynchronously kick off this function, which ends up
-          // notifying the appropriate kernel_notifications objects,
-          // which this thread waits on before exiting.
-          this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
-        }
-      }
-    }
-
-    // Make sure all the kernels are done.
-    for (size_t i = 0; i < kernel_notifications.size(); ++i) {
-      wait_until_ready(kernel_notifications[i]);
-      delete kernel_notifications[i];
-    }
-
-    // No need to wait for lhs notifications since they should have
-    // already been waited on.  Just clean them up.
-    for (size_t i = 0; i < lhs_notifications.size(); ++i) {
-      delete lhs_notifications[i];
-    }
-
-    // deallocate all of the memory for both A and B's
-    for (size_t i = 0; i < blockAs.size(); i++) {
-      this->m_device.deallocate(blockAs[i]);
-    }
-    for (size_t i = 0; i < blockBs.size(); i++) {
-      this->m_device.deallocate(blockBs[i]);
-    }
-
-#undef CEIL_DIV
-  }
-
-  /*
-   * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
-   * the LHS block, check that all of the kernels that worked on the same
-   * mt_block_idx in the previous m_block are done.
-   */
-  template <typename packLArg, typename LhsPacker>
-  static void packLhs(const packLArg arg) {
-    // perform actual packing
-    LhsPacker pack_lhs;
-    pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
-  }
-
-  /*
-   * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
-   * all kernels in the previous block are done.
-   * Then for each LHS future, we wait on the future and then call GEBP
-   * on the area packed by the future (which starts at
-   * blockA + future_idx * mt * kc) on the LHS and with the full packed
-   * RHS block.
-   * The output of this GEBP is written to output(m + i * mt, n).
-   */
-  template <typename packRKArg, typename RhsPacker, typename GebpKernel>
-  static void packRhsAndKernel(packRKArg arg) {
-    if (arg.need_to_pack) {
-      RhsPacker pack_rhs;
-      pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
-    }
-
-    GebpKernel gebp;
-    for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
-      const Index m_base_start = arg.m + arg.mc*mt_block_idx;
-      if (m_base_start < arg.max_m) {
-        Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
-        wait_until_ready((*arg.lhs_notifications)[blockAId]);
-        const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
-        gebp(arg.output.getSubMapper(m_base_start, arg.n),
-             (*arg.blockAs)[blockAId], arg.blockB,
-             actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
-
-        // Notify that the kernel is done.
-        const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
-        (*arg.kernel_notifications)[set_idx]->Notify();
-      }
-    }
-  }
-#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
-
   TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
                                bool shard_by_col, bool prepacked) const {
     const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
                                           PacketType<RhsScalar, Device>::size);
     const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
     const double kd = static_cast<double>(bk);
-    // Peak VFMA bandwidth is 0.5. However if we have not enough data for
-    // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
-    // experimentally.
-    double computeBandwidth = bk == 1 ? 4.0 :
-          (shard_by_col ? bn : bm) < Traits::nr ||
-          (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
-#ifndef EIGEN_VECTORIZE_FMA
-    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
-    // However for MULPS/ADDPS we have dependent sequence of 2 such instructions,
-    // so overall bandwidth is 1.0.
-    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
-#endif
+    double compute_bandwidth = computeBandwidth(false, bm, bn, bk);
     // Computations.
-    TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
+    TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size);
     // Output stores.
     cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
     if (prepacked) {
@@ -1035,6 +1583,94 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       rhsCost.dropMemoryCost();
     return cost + lhsCost + rhsCost;
   }
+
+  // Decide whether we want to shard m x k x n contraction over the inner
+  // (contraction) dimension (k).
+  static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
+                              int num_threads_by_k) {
+    std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
+    bool shard_by_k = false;
+    if (n == 1 ||                // If mat*vec or...
+        num_threads_by_k < 2 ||  // running single threaded or...
+        num_threads_by_k <
+            num_threads ||  // sharding by k gives less parallelism or...
+        bufsize > l3CacheSize() / num_threads_by_k ||  // need more buffer space
+        // than L3 cache or...
+        k / num_threads_by_k < 2 * Traits::nr) {  // k per thread is tiny.
+      shard_by_k = false;
+    } else if (numext::maxi(m, n) / num_threads <
+                   Traits::nr ||  // both other dimensions are tiny or...
+               // k per thread is not small and...
+               (k / num_threads_by_k > 8 * Traits::nr &&
+                // one of the outer dimensions is tiny or sharding by k offers
+                // more parallelism.
+                (numext::mini(m, n) < 2 * Traits::nr ||
+                 num_threads_by_k > num_threads))) {
+      shard_by_k = true;
+    }
+    return shard_by_k;
+  }
+
+  TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
+    // Compute cost.
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
+    // Output stores.
+    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
+    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n;
+    // Since the inner gemm kernel is always sharded by column, the lhs
+    // load cost is negligible.
+    lhsCost.dropMemoryCost();
+    return cost + lhsCost + rhsCost;
+  }
+
+  int numThreadsInnerDim(Index m, Index n, Index k) const {
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    TensorOpCost cost = contractionCostPerInnerDim(m, n, k);
+    double total_parallel_cost =
+        TensorCostModel<ThreadPoolDevice>::totalCost(k, cost);
+    // Cost of reduction step accumulating the m*n per-thread buffers into the
+    // result.
+    double reduction_cost = TensorCostModel<ThreadPoolDevice>::totalCost(
+        m * n, TensorOpCost(2, 1, 1, true, output_packet_size));
+    int num_threads = 1;
+    double min_cost = total_parallel_cost;
+    double kPerThreadOverHead = 3000;
+    double kFixedOverHead = 100000;
+    for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
+      double sequential_cost =
+          kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
+      double parallel_cost = total_parallel_cost / nt + sequential_cost;
+      if (parallel_cost < min_cost) {
+        num_threads = nt;
+        min_cost = parallel_cost;
+      }
+    }
+    return num_threads;
+  }
+
+  double computeBandwidth(bool shard_by_col, Index bm, Index bn,
+                          Index bk) const {
+    // Peak VFMA bandwidth is 0.5. However if we have not enough data for
+    // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
+    // experimentally.
+    double computeBandwidth =
+        bk == 1 ? 4.0
+                : (shard_by_col ? bn : bm) < Traits::nr ||
+                          (shard_by_col ? bm : bn) < Traits::mr
+                      ? 2.0
+                      : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
+    // However for MULPS/ADDPS we have dependent sequence of 2 such
+    // instructions,
+    // so overall bandwidth is 1.0.
+    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
+    return computeBandwidth;
+  }
+
 };
 
 } // end namespace Eigen
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 860a6949a9bde65f51e8b3f51f95304db62b25db..09d2da9a8d2ed240aa1c9bc9ba45bb54d2d7fabd 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -32,6 +32,7 @@ struct traits<TensorConversionOp<TargetType, XprType> >
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = traits<XprType>::Layout;
   enum { Flags = 0 };
+  typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
 };
 
 template<typename TargetType, typename XprType>
@@ -50,7 +51,10 @@ struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorCo
 
 
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
-struct PacketConverter {
+struct PacketConverter;
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 1> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketConverter(const TensorEvaluator& impl)
       : m_impl(impl) {}
@@ -108,7 +112,33 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
 };
 
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
-struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 8, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl) {}
+
+  template<int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + 1 * SrcPacketSize);
+    SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+    SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+    SrcPacket src5 = m_impl.template packet<LoadMode>(index + 4 * SrcPacketSize);
+    SrcPacket src6 = m_impl.template packet<LoadMode>(index + 5 * SrcPacketSize);
+    SrcPacket src7 = m_impl.template packet<LoadMode>(index + 6 * SrcPacketSize);
+    SrcPacket src8 = m_impl.template packet<LoadMode>(index + 7 * SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4, src5, src6, src7, src8);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketConverter(const TensorEvaluator& impl)
       : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
@@ -128,6 +158,7 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
       typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
       internal::scalar_cast_op<SrcType, TgtType> converter;
       EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < TgtPacketSize; ++i) {
         values[i] = converter(m_impl.coeff(index+i));
       }
@@ -163,19 +194,114 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
     typename XprType::Nested m_xpr;
 };
 
-template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
+template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
+  static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
     impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 };
 
-template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
+template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
+  static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
     return impl.evalSubExprsIfNeeded(data);
   }
 };
 
+#ifdef EIGEN_USE_THREADS
+template <bool SameType, typename Eval, typename EvalPointerType,
+          typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync {
+  static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) {
+    impl.evalSubExprsIfNeededAsync(nullptr, std::move(done));
+  }
+};
+
+template <typename Eval, typename EvalPointerType,
+          typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType,
+                                  EvalSubExprsCallback> {
+  static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) {
+    impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+};
+#endif
+
+namespace internal {
+
+template <typename SrcType, typename TargetType, bool IsSameT>
+struct CoeffConv {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    return converter(impl.coeff(index));
+  }
+};
+
+template <typename SrcType, typename TargetType>
+struct CoeffConv<SrcType, TargetType, true> {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    return impl.coeff(index);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
+struct PacketConv {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+  static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = converter(impl.coeff(index+i));
+    }
+    TargetPacket rslt = internal::pload<TargetPacket>(values);
+    return rslt;
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+    const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+    PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket,
+                    SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+    return converter.template packet<LoadMode>(index);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+  static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i);
+    return internal::pload<TargetPacket>(values);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    return impl.template packet<LoadMode>(index);
+  }
+};
+
+}  // namespace internal
 
 // Eval as rvalue
 template<typename TargetType, typename ArgType, typename Device>
@@ -189,44 +315,98 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename PacketType<SrcType, Device>::type PacketSourceType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = true,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      =
+    #ifndef EIGEN_USE_SYCL
+                        true,
+    #else
+                        TensorEvaluator<ArgType, Device>::PacketAccess &
+                        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+    #endif
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
+  };
+
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  struct TensorConversionOpBlockFactory {
+    template <typename ArgXprType>
+    struct XprType {
+      typedef TensorConversionOp<TargetType, const ArgXprType> type;
+    };
+
+    template <typename ArgXprType>
+    typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
+      return typename XprType<ArgXprType>::type(expr);
+    }
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
+                                         ArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_impl(op.expression(), device)
   {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
   {
-    return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
+    return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType data, EvalSubExprsCallback done) {
+    ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>,
+                               EvaluatorPointerType,
+        EvalSubExprsCallback>::run(m_impl, data, std::move(done));
   }
+#endif
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+  EIGEN_STRONG_INLINE void cleanup()
   {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    internal::scalar_cast_op<SrcType, TargetType> converter;
-    return converter(m_impl.coeff(index));
+    return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index);
   }
 
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
-  {
-    const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
-        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
-    return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
+  packet(Index index) const {
+    // If we are not going to do the cast, we just need to check that base
+    // TensorEvaluator has packet access. Otherwise we also need to make sure,
+    // that we have an implementation of vectorized cast.
+    const bool Vectorizable =
+        IsSameType
+        ? TensorEvaluator<ArgType, Device>::PacketAccess
+        : int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+          int(internal::type_casting_traits<SrcType, TargetType>::VectorizedCast);
+
+    return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode,
+                                Vectorizable, IsSameType>::run(m_impl, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -244,33 +424,30 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
     }
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return m_impl.getResourceRequirements();
+  }
 
-  protected:
-  template <int LoadMode, bool ActuallyVectorize>
-  struct PacketConv {
-    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-      internal::scalar_cast_op<SrcType, TargetType> converter;
-      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-      for (int i = 0; i < PacketSize; ++i) {
-        values[i] = converter(impl.coeff(index+i));
-      }
-      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-      return rslt;
-    }
-  };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    return TensorBlock(m_impl.block(desc, scratch),
+                         TensorConversionOpBlockFactory());
+  }
 
-  template <int LoadMode>
-  struct PacketConv<LoadMode, true> {
-    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-      const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
-      const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
-      PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
-                      SrcCoeffRatio, TgtCoeffRatio> converter(impl);
-      return converter.template packet<LoadMode>(index);
-    }
-  };
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  /// required by sycl in order to extract the sycl accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
+ protected:
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index abdf742c6ef859c8f02d36981a2ddbd14b482f42..b20f80ba2a07166413e4e05d90c9995086dd34f1 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -54,8 +54,8 @@ class IndexMapper {
       }
     }
 
-    array<Index, NumDims> cudaInputDimensions;
-    array<Index, NumDims> cudaOutputDimensions;
+    array<Index, NumDims> gpuInputDimensions;
+    array<Index, NumDims> gpuOutputDimensions;
     array<Index, NumDims> tmp = dimensions;
     array<Index, NumDims> ordering;
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -65,8 +65,8 @@ class IndexMapper {
       const Index index = i + offset;
       ordering[index] = indices[i];
       tmp[indices[i]] = -1;
-      cudaInputDimensions[index] = input_dims[indices[i]];
-      cudaOutputDimensions[index] = dimensions[indices[i]];
+      gpuInputDimensions[index] = input_dims[indices[i]];
+      gpuOutputDimensions[index] = dimensions[indices[i]];
     }
 
     int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -75,8 +75,8 @@ class IndexMapper {
     for (int i = 0; i < NumDims; ++i) {
       if (tmp[i] >= 0) {
         ordering[written] = i;
-        cudaInputDimensions[written] = input_dims[i];
-        cudaOutputDimensions[written] = dimensions[i];
+        gpuInputDimensions[written] = input_dims[i];
+        gpuOutputDimensions[written] = dimensions[i];
         ++written;
       }
     }
@@ -89,37 +89,37 @@ class IndexMapper {
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = 0; i < NumDims; ++i) {
         if (i > NumKernelDims) {
-          m_cudaInputStrides[i] =
-              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
-          m_cudaOutputStrides[i] =
-              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
+          m_gpuInputStrides[i] =
+              m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
+          m_gpuOutputStrides[i] =
+              m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
         } else {
-          m_cudaInputStrides[i] = 1;
-          m_cudaOutputStrides[i] = 1;
+          m_gpuInputStrides[i] = 1;
+          m_gpuOutputStrides[i] = 1;
         }
       }
     } else {
       for (int i = NumDims - 1; i >= 0; --i) {
-        if (i + 1 < offset) {
-          m_cudaInputStrides[i] =
-              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
-          m_cudaOutputStrides[i] =
-              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
+        if (static_cast<size_t>(i + 1) < offset) {
+          m_gpuInputStrides[i] =
+              m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
+          m_gpuOutputStrides[i] =
+              m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
         } else {
-          m_cudaInputStrides[i] = 1;
-          m_cudaOutputStrides[i] = 1;
+          m_gpuInputStrides[i] = 1;
+          m_gpuOutputStrides[i] = 1;
         }
       }
     }
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int d = NumDims - 1; d > NumKernelDims; --d) {
-        const Index idx = p / m_cudaInputStrides[d];
+        const Index idx = p / m_gpuInputStrides[d];
         inputIndex += idx * m_inputStrides[d];
-        p -= idx * m_cudaInputStrides[d];
+        p -= idx * m_gpuInputStrides[d];
       }
       inputIndex += p * m_inputStrides[NumKernelDims];
     } else {
@@ -128,22 +128,22 @@ class IndexMapper {
         limit = NumDims - NumKernelDims - 1;
       }
       for (int d = 0; d < limit; ++d) {
-        const Index idx = p / m_cudaInputStrides[d];
+        const Index idx = p / m_gpuInputStrides[d];
         inputIndex += idx * m_inputStrides[d];
-        p -= idx * m_cudaInputStrides[d];
+        p -= idx * m_gpuInputStrides[d];
       }
       inputIndex += p * m_inputStrides[limit];
     }
     return inputIndex;
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
     Index outputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int d = NumDims - 1; d > NumKernelDims; --d) {
-        const Index idx = p / m_cudaOutputStrides[d];
+        const Index idx = p / m_gpuOutputStrides[d];
         outputIndex += idx * m_outputStrides[d];
-        p -= idx * m_cudaOutputStrides[d];
+        p -= idx * m_gpuOutputStrides[d];
       }
       outputIndex += p * m_outputStrides[NumKernelDims];
     } else {
@@ -152,44 +152,44 @@ class IndexMapper {
         limit = NumDims - NumKernelDims - 1;
       }
       for (int d = 0; d < limit; ++d) {
-        const Index idx = p / m_cudaOutputStrides[d];
+        const Index idx = p / m_gpuOutputStrides[d];
         outputIndex += idx * m_outputStrides[d];
-        p -= idx * m_cudaOutputStrides[d];
+        p -= idx * m_gpuOutputStrides[d];
       }
       outputIndex += p * m_outputStrides[limit];
     }
     return outputIndex;
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_inputStrides[offset];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_outputStrides[offset];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
@@ -197,7 +197,7 @@ class IndexMapper {
            k * m_inputStrides[offset + 2];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
@@ -209,8 +209,8 @@ class IndexMapper {
   static const int NumDims = internal::array_size<InputDims>::value;
   array<Index, NumDims> m_inputStrides;
   array<Index, NumDims> m_outputStrides;
-  array<Index, NumDims> m_cudaInputStrides;
-  array<Index, NumDims> m_cudaOutputStrides;
+  array<Index, NumDims> m_gpuInputStrides;
+  array<Index, NumDims> m_gpuOutputStrides;
 };
 
 
@@ -231,6 +231,8 @@ struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = traits<InputXprType>::NumDimensions;
   static const int Layout = traits<InputXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
+  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
 
   enum {
     Flags = 0
@@ -300,17 +302,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+    IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
+    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
+    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<InputArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -374,12 +384,12 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     preloadKernel();
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_inputImpl.cleanup();
     if (m_local_kernel) {
       m_device.deallocate((void*)m_kernel);
@@ -465,7 +475,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
                                        PacketSize));
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
  private:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -521,11 +531,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
       m_local_kernel = false;
     } else {
       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
-      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
       typedef TensorEvalToOp<const KernelArgType> EvalTo;
       EvalTo evalToTmp(local, m_kernelArg);
-      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
-      internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
+      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
 
       m_kernel = local;
       m_local_kernel = true;
@@ -544,14 +554,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   KernelArgType m_kernelArg;
   const Scalar* m_kernel;
   bool m_local_kernel;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
 
 
 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
 
 template <int StaticKernelSize>
 struct GetKernelSize {
@@ -568,13 +578,17 @@ struct GetKernelSize<Dynamic> {
 
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSize>
-__global__ void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
         indexMapper,
     const float* __restrict kernel, const int numPlanes, const int numX,
     const int maxX, const int kernelSize, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   const int first_x = blockIdx.x * maxX;
   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -586,18 +600,18 @@ __global__ void EigenConvolutionKernel1D(
 
   for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
     // Load inputs to shared memory
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = threadIdx.y * num_x_input;
     #pragma unroll
     for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
       s[i + plane_kernel_offset] = eval.coeff(tensor_index);
     }
 
     __syncthreads();
 
     // Compute the convolution
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     #pragma unroll
     for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
@@ -607,7 +621,7 @@ __global__ void EigenConvolutionKernel1D(
       for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
         result += s[k + kernel_offset] * kernel[k];
       }
-      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
       buffer[tensor_index] = result;
     }
     __syncthreads();
@@ -616,14 +630,18 @@ __global__ void EigenConvolutionKernel1D(
 
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
         indexMapper,
     const float* __restrict kernel, const int numPlanes, const int numX,
     const int maxX, const int numY, const int maxY, const int kernelSizeX,
     const int kernelSizeY, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   const int first_x = blockIdx.x * maxX;
   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -640,7 +658,7 @@ __global__ void EigenConvolutionKernel2D(
 
   for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
 
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = threadIdx.z * num_y_input;
 
     // Load inputs to shared memory
@@ -649,7 +667,7 @@ __global__ void EigenConvolutionKernel2D(
       const int input_offset = num_x_input * (j + plane_kernel_offset);
       #pragma unroll
       for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
         s[i + input_offset] = eval.coeff(tensor_index);
       }
     }
@@ -657,7 +675,7 @@ __global__ void EigenConvolutionKernel2D(
     __syncthreads();
 
     // Convolution
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     #pragma unroll
     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -673,7 +691,7 @@ __global__ void EigenConvolutionKernel2D(
             result += s[k + input_offset] * kernel[k + kernel_offset];
           }
         }
-        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
         buffer[tensor_index] = result;
       }
     }
@@ -683,7 +701,7 @@ __global__ void EigenConvolutionKernel2D(
 };
 
 template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
         indexMapper,
@@ -691,7 +709,11 @@ __global__ void EigenConvolutionKernel3D(
     const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
     const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
     const size_t kernelSizeZ, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   // Load inputs to shared memory
   const int first_x = blockIdx.x * maxX;
@@ -708,13 +730,13 @@ __global__ void EigenConvolutionKernel3D(
 
   for (int p = 0; p < numPlanes; ++p) {
 
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
 
     for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
       for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
         for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
         }
       }
@@ -726,7 +748,7 @@ __global__ void EigenConvolutionKernel3D(
     const int num_z_output = last_z - first_z + 1;
     const int num_y_output = last_y - first_y + 1;
     const int num_x_output = last_x - first_x + 1;
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
       for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -739,7 +761,7 @@ __global__ void EigenConvolutionKernel3D(
               }
             }
           }
-          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
           buffer[tensor_index] = result;
         }
       }
@@ -764,13 +786,19 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   enum {
     IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
     PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const GpuDevice& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -852,9 +880,9 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
 
     const int maxSharedMem = m_device.sharedMemPerBlock();
-    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
-    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
-    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
+    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
+    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
     const int warpSize = 32;
 
     switch (NumKernelDims) {
@@ -889,7 +917,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         }
 
         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
@@ -906,15 +934,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
             m_inputImpl.dimensions(), kernel_dims, indices);
         switch(kernel_size) {
           case 4: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
             break;
           }
           case 7: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
             break;
           }
           default: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
           }
         }
         break;
@@ -946,7 +974,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
 
         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int num_y_blocks = ceil(numY, maxY);
@@ -967,11 +995,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
           case 4: {
             switch (kernel_size_y) {
               case 7: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                 break;
               }
               default: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                 break;
               }
             }
@@ -980,18 +1008,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
           case 7: {
             switch (kernel_size_y) {
               case 4: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                 break;
               }
               default: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                 break;
               }
             }
             break;
           }
           default: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
             break;
           }
         }
@@ -1026,7 +1054,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
 
         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
         const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
@@ -1037,7 +1065,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
             m_inputImpl.dimensions(), kernel_dims, indices);
 
-        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
         break;
       }
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
new file mode 100644
index 0000000000000000000000000000000000000000..033318fdcc86e1e492f1c3277147f9de919e681e
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -0,0 +1,544 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
+struct EigenConvolutionKernel;
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV1D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
+  const size_t kernelSize;
+  const cl::sycl::range<2> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
+                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernelSize(kernelSize_),
+        input_range(input_range_) {}
+
+  template <typename BooleanDim2>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
+    return (boolean_check[0] && boolean_check[1]);
+  }
+  void operator()(cl::sycl::nd_item<2> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    // the required row to be calculated for the for each plane in shered memory
+    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
+    /// fill the shared memory
+    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+      const size_t local_index = i + plane_kernel_offset;
+      const size_t tensor_index =
+          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+      local_acc[local_index] =
+          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+              ? device_evaluator.coeff(tensor_index)
+              : CoeffReturnType(0);
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // calculate the convolution // output start x
+    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast<CoeffReturnType>(0);
+      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
+      for (size_t k = 0; k < kernelSize; ++k) {
+        result += (local_acc[k + index] * kernel_ptr[k]);
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV2D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<2> kernel_size;
+  const cl::sycl::range<3> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+
+  void operator()(cl::sycl::nd_item<3> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    // the required row to be calculated for the for each plane in shered memory
+    const auto num_input = cl::sycl::range<2>{
+        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                 itemID.get_group(1) * itemID.get_local_range()[1]};
+      
+    // fill the local memory
+    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); 
+      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+        const size_t local_index = i + local_input_offset;
+        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                                             i + input_offset[0], j + input_offset[1]);
+        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
+                                  in_range_dim1 && in_range_dim2)
+                                     ? device_evaluator.coeff(tensor_index)
+                                     : CoeffReturnType(0);
+      }
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // output offset start for each thread
+    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                  itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast<CoeffReturnType>(0);
+
+      for (size_t j = 0; j < kernel_size[1]; j++) {
+        size_t kernel_offset = kernel_size[0] * j;
+        const size_t index =
+            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
+        for (size_t i = 0; i < kernel_size[0]; i++) {
+          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
+        }
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
+                                                             itemID.get_local_id(1) + output_offset[1]);
+
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV3D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<3> kernel_size;
+  const cl::sycl::range<3> input_range;
+  const size_t numP;
+
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
+                         const size_t numP_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_),
+        numP(numP_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+  void operator()(cl::sycl::nd_item<3> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
+
+    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
+
+    const auto output_offset =
+          cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
+
+    for (size_t p = 0; p < numP; p++) {
+      /// fill the shared memory
+      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
+        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
+        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
+        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
+          size_t local_index_dim1 = (num_input[0] * j)  + local_index_dim2;
+          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
+            const size_t local_index = local_index_dim1 + i;
+            const size_t tensor_index =
+                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
+            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
+          }
+        }
+      }
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+      // calculate the convolution
+
+      if (boundary_check(itemID.get_global_id() < input_range)) {
+        CoeffReturnType result = static_cast<CoeffReturnType>(0);
+        for (size_t k = 0; k < kernel_size[2]; k++) {
+          for (size_t j = 0; j < kernel_size[1]; j++) {
+            for (size_t i = 0; i < kernel_size[0]; i++) {
+              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
+              const size_t local_index =
+                  ((i + itemID.get_local_id(0)) +
+                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
+
+              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
+            }
+          }
+        }
+        const size_t tensor_index =
+            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
+            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
+        buffer_ptr[tensor_index] = result;
+      }
+
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+    }
+  }
+};
+
+template <typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static const int NumDims =
+      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
+  typedef const Eigen::SyclDevice Device;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
+                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
+      : m_inputImpl(op.inputExpression(), device),
+        m_kernelArg(op.kernelExpression()),
+        m_kernelImpl(op.kernelExpression(), device),
+        m_indices(op.indices()),
+        m_buf(NULL),
+        m_kernel(NULL),
+        m_local_kernel(false),
+        m_device(device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
+                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
+        m_kernelImpl.dimensions();
+
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = op.indices()[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      m_dimensions[index] = result_dim;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    preloadKernel();
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      executeEval(data);
+      return false;
+    } else {
+      m_buf = (EvaluatorPointerType)m_device.get(
+          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
+      executeEval(m_buf);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_buf) {
+      m_device.deallocate_temp(m_buf);
+      m_buf = NULL;
+    }
+    if (m_local_kernel) {
+      m_device.deallocate_temp(m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+  /// used by sycl in order to build the sycl buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
+  /// used by sycl in order to build the sycl buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    typename KernelStorage::Type in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
+      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
+    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
+    typedef typename InputEvaluator::Dimensions InputDims;
+    switch (NumKernelDims) {
+      case 1: {
+        const size_t numX = dimensions()[m_indices[0]];
+        const size_t numP = dimensions().TotalSize() / numX;
+        const auto input_dim = std::array<size_t, 2>{numX, numP};
+        auto global_range = cl::sycl::range<2>{};
+        auto local_range = cl::sycl::range<2>{};
+        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 1> indices{{m_indices[0]}};
+        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
+        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
+            ConvKernel;
+
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
+        break;
+      }
+
+      case 2: {
+        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
+        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numP = dimensions().TotalSize() / (numX * numY);
+        auto input_dim = std::array<size_t, 3>{numX, numY, numP};
+
+        auto global_range = cl::sycl::range<3>{};
+        auto local_range = cl::sycl::range<3>{};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+
+        const size_t local_memory_size =
+            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
+        const array<Index, 2> kernel_dims{
+            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
+        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
+            ConvKernel;
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
+        break;
+      }
+
+      case 3: {
+        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
+
+        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
+
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
+        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
+        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
+
+        const array<Index, 3> indices{
+            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
+        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
+                                           m_kernelImpl.dimensions()[kernel_index[1]],
+                                           m_kernelImpl.dimensions()[kernel_index[2]]}};
+
+        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        auto global_range = cl::sycl::range<3>{};
+        auto local_range = cl::sycl::range<3>{};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        auto local_memory_range = (local_range + kernel_size - 1);
+        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
+
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
+            ConvKernel;
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
+        break;
+      }
+
+      default: {
+        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
+                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return m_buf[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+    // model.
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_kernelImpl.bind(cgh);
+    m_inputImpl.bind(cgh);
+    m_buf.bind(cgh);
+    m_kernel.bind(cgh);
+  }
+
+ private:
+  // No assignment (copies are needed by the kernels)
+  TensorEvaluator &operator=(const TensorEvaluator &);
+  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
+  KernelArgType m_kernelArg;
+  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  EvaluatorPointerType m_buf;
+  typename KernelStorage::Type m_kernel;
+  bool m_local_kernel;
+  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+};  // namespace Eigen
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 83c449cf192713e8049d24053d25cee1412c70d1..195267ce8e533b4a300c2c4e49efa5545a5710bf 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -174,8 +174,11 @@ class TensorCostModel {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
       double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
     double cost = totalCost(output_size, cost_per_coeff);
-    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
-    return numext::mini(max_threads, numext::maxi(1, threads));
+    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    // Make sure we don't invoke undefined behavior when we convert to an int.
+    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+    return numext::mini(max_threads,
+                        numext::maxi<int>(1, static_cast<int>(threads)));
   }
 
   // taskSize assesses parallel task size.
@@ -186,14 +189,13 @@ class TensorCostModel {
     return totalCost(output_size, cost_per_coeff) / kTaskSize;
   }
 
- private:
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
       double output_size, const TensorOpCost& cost_per_coeff) {
     // Cost of memory fetches from L2 cache. 64 is typical cache line size.
     // 11 is L2 cache latency on Haswell.
     // We don't know whether data is in L1, L2 or L3. But we are most interested
     // in single-threaded computational time around 100us-10ms (smaller time
-    // is too small for parallelization, larger time is not intersting
+    // is too small for parallelization, larger time is not interesting
     // either because we are probably using all available threads already).
     // And for the target time range, L2 seems to be what matters. Data set
     // fitting into L1 is too small to take noticeable time. Data set fitting
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index e020d076f02db07f9be90b8c466584eb0fbd362a..95a8a84ee410bf852d059381b3b1c0dbcb8140c1 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -30,12 +30,13 @@ struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = traits<XprType>::Layout;
+  typedef typename traits<XprType>::PointerType PointerType;
 };
 
 template<typename CustomUnaryFunc, typename XprType>
 struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
 {
-  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
+  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename CustomUnaryFunc, typename XprType>
@@ -86,18 +87,26 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
   typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<XprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
       : m_op(op), m_device(device), m_result(NULL)
   {
     m_dimensions = op.func().dimensions(op.expression());
@@ -105,21 +114,21 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<CoeffReturnType*>(
-          m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+          m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
       evalTo(m_result);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if (m_result != NULL) {
-      m_device.deallocate(m_result);
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_result) {
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -138,19 +147,25 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_result.bind(cgh);
+  }
+#endif
 
  protected:
-  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
-    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
-        data, m_dimensions);
+  void evalTo(EvaluatorPointerType data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
     m_op.func().eval(m_op.expression(), result, m_device);
   }
 
   Dimensions m_dimensions;
   const ArgType m_op;
-  const Device& m_device;
-  CoeffReturnType* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_result;
 };
 
 
@@ -180,6 +195,8 @@ struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = traits<LhsXprType>::NumDimensions;
   static const int Layout = traits<LhsXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                                typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
 };
 
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
@@ -242,18 +259,27 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<LhsXprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_op(op), m_device(device), m_result(NULL)
   {
     m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
@@ -261,20 +287,21 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+        m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
       evalTo(m_result);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     if (m_result != NULL) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -293,18 +320,25 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_result.bind(cgh);
+  }
+#endif
 
  protected:
-  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
-    TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
+  void evalTo(EvaluatorPointerType data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
     m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
   }
 
   Dimensions m_dimensions;
   const XprType m_op;
-  const Device& m_device;
-  CoeffReturnType* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_result;
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 29e50a3b2df90dd5c2d3a7a2d3c43f8aded882fe..96fa46c86dc0ec6030f6ff95d55f18a9476c1a8d 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -28,6 +28,8 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
   public:
     TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
 
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice)
+
     template<typename OtherDerived>
     EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
       typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
@@ -63,6 +65,73 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
     ExpressionType& m_expression;
 };
 
+/** \class TensorAsyncDevice
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Pseudo expression providing an operator = that will evaluate its
+ * argument asynchronously on the specified device. Currently only
+ * ThreadPoolDevice implements proper asynchronous execution, while the default
+ * and GPU devices just run the expression synchronously and call m_done() on
+ * completion..
+ *
+ * Example:
+ *    auto done = []() { ... expression evaluation done ... };
+ *    C.device(thread_pool_device, std::move(done)) = A + B;
+ */
+
+template <typename ExpressionType, typename DeviceType, typename DoneCallback>
+class TensorAsyncDevice {
+ public:
+  TensorAsyncDevice(const DeviceType& device, ExpressionType& expression,
+                    DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorExecutor<const Assign, DeviceType> Executor;
+
+    Assign assign(m_expression, other);
+    Executor::run(assign, m_device);
+    m_done();
+
+    return *this;
+  }
+
+ protected:
+  const DeviceType& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
+};
+
+
+#ifdef EIGEN_USE_THREADS
+template <typename ExpressionType, typename DoneCallback>
+class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
+ public:
+  TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression,
+                    DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
+
+    // WARNING: After assignment 'm_done' callback will be in undefined state.
+    Assign assign(m_expression, other);
+    Executor::runAsync(assign, m_device, std::move(m_done));
+
+    return *this;
+  }
+
+ protected:
+  const ThreadPoolDevice& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
+};
+#endif
+
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 4f5767bc7f7296f237f260d7b538e9eb34b97d5a..f7792393353cda937b40b53cddf467b2360e3307 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -1,337 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
-
-namespace Eigen {
-
-static const int kCudaScratchSize = 1024;
-
-// This defines an interface that GPUDevice can take to use
-// CUDA streams underneath.
-class StreamInterface {
- public:
-  virtual ~StreamInterface() {}
-
-  virtual const cudaStream_t& stream() const = 0;
-  virtual const cudaDeviceProp& deviceProperties() const = 0;
-
-  // Allocate memory on the actual device where the computation will run
-  virtual void* allocate(size_t num_bytes) const = 0;
-  virtual void deallocate(void* buffer) const = 0;
-
-  // Return a scratchpad buffer of size 1k
-  virtual void* scratchpad() const = 0;
-
-  // Return a semaphore. The semaphore is initially initialized to 0, and
-  // each kernel using it is responsible for resetting to 0 upon completion
-  // to maintain the invariant that the semaphore is always equal to 0 upon
-  // each kernel start.
-  virtual unsigned int* semaphore() const = 0;
-};
-
-static cudaDeviceProp* m_deviceProperties;
-static bool m_devicePropInitialized = false;
-
-static void initializeDeviceProp() {
-  if (!m_devicePropInitialized) {
-    // Attempts to ensure proper behavior in the case of multiple threads
-    // calling this function simultaneously. This would be trivial to
-    // implement if we could use std::mutex, but unfortunately mutex don't
-    // compile with nvcc, so we resort to atomics and thread fences instead.
-    // Note that if the caller uses a compiler that doesn't support c++11 we
-    // can't ensure that the initialization is thread safe.
-#if __cplusplus >= 201103L
-    static std::atomic<bool> first(true);
-    if (first.exchange(false)) {
-#else
-    static bool first = true;
-    if (first) {
-      first = false;
-#endif
-      // We're the first thread to reach this point.
-      int num_devices;
-      cudaError_t status = cudaGetDeviceCount(&num_devices);
-      if (status != cudaSuccess) {
-        std::cerr << "Failed to get the number of CUDA devices: "
-                  << cudaGetErrorString(status)
-                  << std::endl;
-        assert(status == cudaSuccess);
-      }
-      m_deviceProperties = new cudaDeviceProp[num_devices];
-      for (int i = 0; i < num_devices; ++i) {
-        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
-        if (status != cudaSuccess) {
-          std::cerr << "Failed to initialize CUDA device #"
-                    << i
-                    << ": "
-                    << cudaGetErrorString(status)
-                    << std::endl;
-          assert(status == cudaSuccess);
-        }
-      }
-
-#if __cplusplus >= 201103L
-      std::atomic_thread_fence(std::memory_order_release);
-#endif
-      m_devicePropInitialized = true;
-    } else {
-      // Wait for the other thread to inititialize the properties.
-      while (!m_devicePropInitialized) {
-#if __cplusplus >= 201103L
-        std::atomic_thread_fence(std::memory_order_acquire);
-#endif
-        sleep(1);
-      }
-    }
-  }
-}
-
-static const cudaStream_t default_stream = cudaStreamDefault;
-
-class CudaStreamDevice : public StreamInterface {
- public:
-  // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
-    cudaGetDevice(&device_);
-    initializeDeviceProp();
-  }
-  // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    initializeDeviceProp();
-  }
-  // Use the specified stream. Note that it's the
-  // caller responsibility to ensure that the stream can run on
-  // the specified device. If no device is specified the code
-  // assumes that the stream is associated to the current gpu device.
-  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    if (device < 0) {
-      cudaGetDevice(&device_);
-    } else {
-      int num_devices;
-      cudaError_t err = cudaGetDeviceCount(&num_devices);
-      EIGEN_UNUSED_VARIABLE(err)
-      assert(err == cudaSuccess);
-      assert(device < num_devices);
-      device_ = device;
-    }
-    initializeDeviceProp();
-  }
-
-  virtual ~CudaStreamDevice() {
-    if (scratch_) {
-      deallocate(scratch_);
-    }
-  }
-
-  const cudaStream_t& stream() const { return *stream_; }
-  const cudaDeviceProp& deviceProperties() const {
-    return m_deviceProperties[device_];
-  }
-  virtual void* allocate(size_t num_bytes) const {
-    cudaError_t err = cudaSetDevice(device_);
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-    void* result;
-    err = cudaMalloc(&result, num_bytes);
-    assert(err == cudaSuccess);
-    assert(result != NULL);
-    return result;
-  }
-  virtual void deallocate(void* buffer) const {
-    cudaError_t err = cudaSetDevice(device_);
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-    assert(buffer != NULL);
-    err = cudaFree(buffer);
-    assert(err == cudaSuccess);
-  }
-
-  virtual void* scratchpad() const {
-    if (scratch_ == NULL) {
-      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
-    }
-    return scratch_;
-  }
-
-  virtual unsigned int* semaphore() const {
-    if (semaphore_ == NULL) {
-      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
-      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
-      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
-      EIGEN_UNUSED_VARIABLE(err)
-      assert(err == cudaSuccess);
-    }
-    return semaphore_;
-  }
-
- private:
-  const cudaStream_t* stream_;
-  int device_;
-  mutable void* scratch_;
-  mutable unsigned int* semaphore_;
-};
-
-struct GpuDevice {
-  // The StreamInterface is not owned: the caller is
-  // responsible for its initialization and eventual destruction.
-  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
-    eigen_assert(stream);
-  }
-  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
-    eigen_assert(stream);
-  }
-  // TODO(bsteiner): This is an internal API, we should not expose it.
-  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
-    return stream_->stream();
-  }
-
-  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return stream_->allocate(num_bytes);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    stream_->deallocate(buffer);
-  }
-
-  EIGEN_STRONG_INLINE void* scratchpad() const {
-    return stream_->scratchpad();
-  }
-
-  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
-    return stream_->semaphore();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
-    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
-                                      stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-#else
-  eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
-  }
-
-  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-    cudaError_t err =
-        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-  }
-
-  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-    cudaError_t err =
-        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-#ifndef __CUDA_ARCH__
-    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-#else
-  eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
-  }
-
-  EIGEN_STRONG_INLINE size_t numThreads() const {
-    // FIXME
-    return 32;
-  }
-
-  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-    // FIXME
-    return 48*1024;
-  }
-
-  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-    // We won't try to take advantage of the l2 cache for the time being, and
-    // there is no l3 cache on cuda devices.
-    return firstLevelCacheSize();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
-    cudaError_t err = cudaStreamSynchronize(stream_->stream());
-    if (err != cudaSuccess) {
-      std::cerr << "Error detected in CUDA stream: "
-                << cudaGetErrorString(err)
-                << std::endl;
-      assert(err == cudaSuccess);
-    }
-#else
-    assert(false && "The default device should be used instead to generate kernel code");
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
 #endif
-  }
-
-  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-    return stream_->deviceProperties().multiProcessorCount;
-  }
-  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-    return stream_->deviceProperties().maxThreadsPerBlock;
-  }
-  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-  }
-  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-    return stream_->deviceProperties().sharedMemPerBlock;
-  }
-  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-    return stream_->deviceProperties().major;
-  }
-  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-    return stream_->deviceProperties().minor;
-  }
-
-  EIGEN_STRONG_INLINE int maxBlocks() const {
-    return max_blocks_;
-  }
-
-  // This function checks if the CUDA runtime recorded an error for the
-  // underlying stream device.
-  inline bool ok() const {
-#ifdef __CUDACC__
-    cudaError_t error = cudaStreamQuery(stream_->stream());
-    return (error == cudaSuccess) || (error == cudaErrorNotReady);
-#else
-    return false;
-#endif
-  }
-
- private:
-  const StreamInterface* stream_;
-  int max_blocks_;
-};
-
-#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
-  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
-  assert(cudaGetLastError() == cudaSuccess);
-
-
-// FIXME: Should be device and kernel specific.
-#ifdef __CUDACC__
-static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
-#ifndef __CUDA_ARCH__
-  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
-  EIGEN_UNUSED_VARIABLE(status)
-  assert(status == cudaSuccess);
-#else
-  EIGEN_UNUSED_VARIABLE(config)
-#endif
-}
-#endif
-
-}  // end namespace Eigen
 
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
+#include "TensorDeviceGpu.h"
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 9d141395b7a40728bb5d9c2068022ad74bf75e24..46b9d3ab2aeadf6853e8402fd6b2339793cc72a5 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -20,6 +20,12 @@ struct DefaultDevice {
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
+  }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return allocate(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    deallocate(buffer);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
@@ -33,11 +39,18 @@ struct DefaultDevice {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
   }
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
+    return data;
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#ifndef __CUDA_ARCH__
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
     // Running on the host CPU
     return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return 64;
 #else
     // Running on a CUDA device
     return 32;
@@ -45,9 +58,12 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#ifndef __CUDA_ARCH__
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
     // Running on the host CPU
     return l1CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return 48*1024; // FIXME : update this number for HIP
 #else
     // Running on a CUDA device, return the amount of shared memory available.
     return 48*1024;
@@ -55,9 +71,12 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#ifndef __CUDA_ARCH__
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
     // Running single threaded on the host CPU
     return l3CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return firstLevelCacheSize(); // FIXME : update this number for HIP
 #else
     // Running on a CUDA device
     return firstLevelCacheSize();
@@ -65,13 +84,17 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
     // Running single threaded on the host CPU
     // Should return an enum that encodes the ISA supported by the CPU
     return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    // return 1 as major for HIP
+    return 1;
 #else
     // Running on a CUDA device
-    return __CUDA_ARCH__ / 100;
+    return EIGEN_CUDA_ARCH / 100;
 #endif
   }
 };
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec2e3cb143aaaea2b2ef996109cb8cdf25f902c1
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
+
+// This header file container defines fo gpu* macros which will resolve to
+// their equivalent hip* or cuda* versions depending on the compiler in use
+// A separate header (included at the end of this file) will undefine all 
+#include "TensorGpuHipCudaDefines.h"
+
+namespace Eigen {
+
+static const int kGpuScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP / CUDA streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const gpuStream_t& stream() const = 0;
+  virtual const gpuDeviceProp_t& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
+};
+
+class GpuDeviceProperties {
+ public:
+  GpuDeviceProperties() : 
+      initialized_(false), first_(true), device_properties_(nullptr) {}
+ 
+  ~GpuDeviceProperties() {
+    if (device_properties_) {
+      delete[] device_properties_;
+    }
+  }
+  
+  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
+    return device_properties_[device];
+  }
+
+  EIGEN_STRONG_INLINE bool isInitialized() const {
+    return initialized_;
+  }
+
+  void initialize() {
+    if (!initialized_) {
+      // Attempts to ensure proper behavior in the case of multiple threads
+      // calling this function simultaneously. This would be trivial to
+      // implement if we could use std::mutex, but unfortunately mutex don't
+      // compile with nvcc, so we resort to atomics and thread fences instead.
+      // Note that if the caller uses a compiler that doesn't support c++11 we
+      // can't ensure that the initialization is thread safe.
+      if (first_.exchange(false)) {
+        // We're the first thread to reach this point.
+        int num_devices;
+        gpuError_t status = gpuGetDeviceCount(&num_devices);
+        if (status != gpuSuccess) {
+          std::cerr << "Failed to get the number of GPU devices: "
+                    << gpuGetErrorString(status)
+                    << std::endl;
+          gpu_assert(status == gpuSuccess);
+        }
+        device_properties_ = new gpuDeviceProp_t[num_devices];
+        for (int i = 0; i < num_devices; ++i) {
+          status = gpuGetDeviceProperties(&device_properties_[i], i);
+          if (status != gpuSuccess) {
+            std::cerr << "Failed to initialize GPU device #"
+                      << i
+                      << ": "
+                      << gpuGetErrorString(status)
+                      << std::endl;
+            gpu_assert(status == gpuSuccess);
+          }
+        }
+
+        std::atomic_thread_fence(std::memory_order_release);
+        initialized_ = true;
+      } else {
+        // Wait for the other thread to inititialize the properties.
+        while (!initialized_) {
+          std::atomic_thread_fence(std::memory_order_acquire);
+          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        }
+      }
+    }
+  }
+
+ private:
+  volatile bool initialized_;
+  std::atomic<bool> first_;
+  gpuDeviceProp_t* device_properties_;
+};
+
+EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
+  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
+  if (!deviceProperties->isInitialized()) {
+    deviceProperties->initialize();
+  }
+  return *deviceProperties;
+}
+
+EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
+  return GetGpuDeviceProperties().get(device);
+}
+
+static const gpuStream_t default_stream = gpuStreamDefault;
+
+class GpuStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+    gpuGetDevice(&device_);
+  }
+  // Use the default stream on the specified device
+  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
+  // Use the specified stream. Note that it's the
+  // caller responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    if (device < 0) {
+      gpuGetDevice(&device_);
+    } else {
+      int num_devices;
+      gpuError_t err = gpuGetDeviceCount(&num_devices);
+      EIGEN_UNUSED_VARIABLE(err)
+      gpu_assert(err == gpuSuccess);
+      gpu_assert(device < num_devices);
+      device_ = device;
+    }
+  }
+
+  virtual ~GpuStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
+  const gpuStream_t& stream() const { return *stream_; }
+  const gpuDeviceProp_t& deviceProperties() const {
+    return GetGpuDeviceProperties(device_);
+  }
+  virtual void* allocate(size_t num_bytes) const {
+    gpuError_t err = gpuSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+    void* result;
+    err = gpuMalloc(&result, num_bytes);
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    gpuError_t err = gpuSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(buffer != NULL);
+    err = gpuFree(buffer);
+    gpu_assert(err == gpuSuccess);
+  }
+
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      gpu_assert(err == gpuSuccess);
+    }
+    return semaphore_;
+  }
+
+ private:
+  const gpuStream_t* stream_;
+  int device_;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+    eigen_assert(stream);
+  }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
+    return stream_->stream();
+  }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void* scratchpad() const {
+    return stream_->scratchpad();
+  }
+
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+    return stream_->semaphore();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
+                                      stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+#else
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n);
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    gpuError_t err =
+        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+  }
+
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    gpuError_t err =
+        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+#else
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
+    return 32;
+  }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48*1024;
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on hip/cuda devices.
+    return firstLevelCacheSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuStreamSynchronize(stream_->stream());
+    if (err != gpuSuccess) {
+      std::cerr << "Error detected in GPU stream: "
+                << gpuGetErrorString(err)
+                << std::endl;
+      gpu_assert(err == gpuSuccess);
+    }
+#else
+    gpu_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
+    return stream_->deviceProperties().multiProcessorCount;
+  }
+  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
+    return stream_->deviceProperties().maxThreadsPerBlock;
+  }
+  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+    return stream_->deviceProperties().sharedMemPerBlock;
+  }
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return stream_->deviceProperties().major;
+  }
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+    return stream_->deviceProperties().minor;
+  }
+
+  EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
+  // This function checks if the GPU runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+#ifdef EIGEN_GPUCC
+    gpuError_t error = gpuStreamQuery(stream_->stream());
+    return (error == gpuSuccess) || (error == gpuErrorNotReady);
+#else
+    return false;
+#endif
+  }
+
+ private:
+  const StreamInterface* stream_;
+  int max_blocks_;
+};
+
+#if defined(EIGEN_HIPCC)
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
+  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
+  gpu_assert(hipGetLastError() == hipSuccess);
+
+#else
+ 
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
+  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
+  gpu_assert(cudaGetLastError() == cudaSuccess);
+
+#endif
+ 
+// FIXME: Should be device and kernel specific.
+#ifdef EIGEN_GPUCC
+static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
+  EIGEN_UNUSED_VARIABLE(status)
+  gpu_assert(status == gpuSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+}  // end namespace Eigen
+
+// undefine all the gpu* macros we defined at the beginning of the file
+#include "TensorGpuHipCudaUndefines.h"
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 7c039890e2e920f1aeeb7e856f43607fcec14074..df591c21d86b787c6bd9d3f8cadb1bb7fd3669e5 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -14,109 +14,1035 @@
 
 #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
+#include <unordered_set>
 
 namespace Eigen {
-struct SyclDevice {
-  /// class members
-  /// sycl queue
-  mutable cl::sycl::queue m_queue;
-  /// std::map is the container used to make sure that we create only one buffer
-  /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
-  /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
-  mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
-  /// creating device by using selector
-  template<typename dev_Selector> SyclDevice(dev_Selector s)
-  :
-#ifdef EIGEN_EXCEPTIONS
-  m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
-    for (const auto& e : l) {
-      try {
-        std::rethrow_exception(e);
-      } catch (cl::sycl::exception e) {
-          std::cout << e.what() << std::endl;
+
+namespace TensorSycl {
+namespace internal {
+
+/// Cache all the device information needed
+struct SyclDeviceInfo {
+  SyclDeviceInfo(cl::sycl::queue queue)
+      : local_mem_type(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::local_mem_type>()),
+        max_work_item_sizes(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_work_item_sizes>()),
+        max_mem_alloc_size(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_mem_alloc_size>()),
+        max_compute_units(queue.get_device()
+                              .template get_info<
+                                  cl::sycl::info::device::max_compute_units>()),
+        max_work_group_size(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_work_group_size>()),
+        local_mem_size(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::local_mem_size>()),
+        platform_name(queue.get_device()
+                          .get_platform()
+                          .template get_info<cl::sycl::info::platform::name>()),
+        device_name(queue.get_device()
+                        .template get_info<cl::sycl::info::device::name>()),
+        device_vendor(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::vendor>()) {}
+
+  cl::sycl::info::local_mem_type local_mem_type;
+  cl::sycl::id<3> max_work_item_sizes;
+  unsigned long max_mem_alloc_size;
+  unsigned long max_compute_units;
+  unsigned long max_work_group_size;
+  size_t local_mem_size;
+  std::string platform_name;
+  std::string device_name;
+  std::string device_vendor;
+};
+
+}  // end namespace internal
+}  // end namespace TensorSycl
+
+typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
+    -> decltype(cl::sycl::device::get_devices()) {
+#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
+  return {cl::sycl::device(cl::sycl::default_selector())};
+#else
+  std::vector<cl::sycl::device> supported_devices;
+  auto platform_list = cl::sycl::platform::get_platforms();
+  for (const auto &platform : platform_list) {
+    auto device_list = platform.get_devices();
+    auto platform_name =
+        platform.template get_info<cl::sycl::info::platform::name>();
+    std::transform(platform_name.begin(), platform_name.end(),
+                   platform_name.begin(), ::tolower);
+    for (const auto &device : device_list) {
+      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
+      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
+      bool unsupported_condition =
+          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
+           vendor.find("apu") == std::string::npos) ||
+          (platform_name.find("experimental") != std::string::npos) ||
+          device.is_host();
+      if (!unsupported_condition) {
+        supported_devices.push_back(device);
+      }
+    }
+  }
+  return supported_devices;
+#endif
+}
+
+class QueueInterface {
+ public:
+  /// Creating device by using cl::sycl::selector or cl::sycl::device.
+  template <typename DeviceOrSelector>
+  explicit QueueInterface(
+      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
+      unsigned num_threads = std::thread::hardware_concurrency())
+      : m_queue(dev_or_sel, handler),
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
+#endif
+        m_thread_pool(num_threads),
+        m_device_info(m_queue) {
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+    m_prog.build_with_kernel_type<DeviceOrSelector>();
+    auto f = [&](cl::sycl::handler &cgh) {
+      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
+                                        [=]() {})
+    };
+    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
+#endif
+  }
+
+  template <typename DeviceOrSelector>
+  explicit QueueInterface(
+      const DeviceOrSelector &dev_or_sel,
+      unsigned num_threads = std::thread::hardware_concurrency())
+      : QueueInterface(dev_or_sel,
+                       [this](cl::sycl::exception_list l) {
+                         this->exception_caught_ = this->sycl_async_handler(l);
+                       },
+                       num_threads) {}
+
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
+#endif
+
+  /// Attach an existing buffer to the pointer map, Eigen will not reuse it
+  EIGEN_STRONG_INLINE void *attach_buffer(
+      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return static_cast<void *>(pMapper.add_pointer(buf));
+  }
+
+  /// Detach previously attached buffer
+  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfree<false>(p, pMapper);
+  }
+
+  /// Allocating device pointer. This pointer is actually an 8 bytes host
+  /// pointer used as key to access the sycl device buffer. The reason is that
+  /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
+  /// expressions. So we create a key pointer to be used in Eigen expression
+  /// construction. When we convert the Eigen construction into the sycl
+  /// construction we use this pointer as a key in our buffer_map and we make
+  /// sure that we dedicate only one buffer only for this pointer. The device
+  /// pointer would be deleted by calling deallocate function.
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+    if (align > 0) {
+      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+    }
+#endif
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+  }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+    if (align > 0) {
+      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+    }
+#endif
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    if (scratch_buffers.empty()) {
+      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+      ;
+    } else {
+      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
+        auto buff = pMapper.get_buffer(*it);
+        if (buff.get_size() >= num_bytes) {
+          auto ptr = *it;
+          scratch_buffers.erase(it);
+          return ptr;
+        } else {
+          ++it;
         }
+      }
+      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
     }
-  }))
 #else
-  m_queue(cl::sycl::queue(s))
+    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
 #endif
-  {}
-  // destructor
-  ~SyclDevice() { deallocate_all(); }
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+      cl::sycl::access::mode::read_write, data_t>
+  get(data_t *data) const {
+    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+                                        data_t>
+          data) const {
+    return static_cast<data_t *>(data.get_virtual_pointer());
+  }
 
-  template <typename T> void deallocate(T *p) const {
-    auto it = buffer_map.find(p);
-    if (it != buffer_map.end()) {
-      buffer_map.erase(it);
-      internal::aligned_free(p);
+  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.insert(p);
+#else
+    TensorSycl::internal::SYCLfree(p, pMapper);
+#endif
+  }
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE void deallocate_temp(
+      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
+    deallocate_temp(p.get_virtual_pointer());
+  }
+
+  /// This is used to deallocate the device pointer. p is used as a key inside
+  /// the map to find the device buffer and delete it.
+  EIGEN_STRONG_INLINE void deallocate(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfree(p, pMapper);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_all() const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfreeAll(pMapper);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.clear();
+#endif
+  }
+
+  /// The memcpyHostToDevice is used to copy the data from host to device
+  /// The destination pointer could be deleted before the copy happend which is
+  /// why a callback function is needed. By default if none is provided, the
+  /// function is blocking.
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(
+      void *dst, const void *src, size_t n,
+      std::function<void()> callback) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
+        write_accessor;
+    if (n == 0) {
+      if (callback) callback();
+      return;
     }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
+      auto non_deleter = [](buffer_scalar_t const *) {};
+      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
+      cgh.copy(s_ptr, dst_acc);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    synchronize_and_callback(e, callback);
   }
-  void deallocate_all() const {
-    std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
-    while (it!=buffer_map.end()) {
-      auto p=it->first;
-      buffer_map.erase(it);
-      internal::aligned_free(const_cast<void*>(p));
-      it=buffer_map.begin();
+
+  /// The memcpyDeviceToHost is used to copy the data from device to host.
+  /// The source pointer could be deleted before the copy happend which is
+  /// why a callback function is needed. By default if none is provided, the
+  /// function is blocking.
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+      void *dst, const void *src, size_t n,
+      std::function<void()> callback) const {
+    static const auto read_mode = cl::sycl::access::mode::read;
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
+        read_accessor;
+    if (n == 0) {
+      if (callback) callback();
+      return;
     }
-    buffer_map.clear();
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
+      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
+      auto non_deleter = [](buffer_scalar_t *) {};
+      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
+      cgh.copy(src_acc, s_ptr);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    synchronize_and_callback(e, callback);
   }
 
-  /// creation of sycl accessor for a buffer. This function first tries to find
-  /// the buffer in the buffer_map. If found it gets the accessor from it, if not,
-  ///the function then adds an entry by creating a sycl buffer for that particular pointer.
-  template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
-  get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
-    return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+  /// The memcpy function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+    static const auto read_mode = cl::sycl::access::mode::read;
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (n == 0) {
+      return;
+    }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
+      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+      cgh.copy(src_acc, dst_acc);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
   }
 
-  template<typename T> inline  std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
-    using Type = cl::sycl::buffer<T, 1>;
-    std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
-      [](void *dataMem) { delete static_cast<Type*>(dataMem); })));
-    (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
-    return ret;
+  /// the memset function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (n == 0) {
+      return;
+    }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
+      // The cast to uint8_t is here to match the behaviour of the standard
+      // memset. The cast to buffer_scalar_t is needed to match the type of the
+      // accessor (in case buffer_scalar_t is not uint8_t)
+      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
   }
 
-  template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
-    return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
+  /// Get a range accessor to the virtual pointer's device memory. This range
+  /// accessor will allow access to the memory from the pointer to the end of
+  /// the buffer.
+  ///
+  /// NOTE: Inside a kernel the range accessor will always be indexed from the
+  /// start of the buffer, so the offset in the accessor is only used by
+  /// methods like handler::copy and will not be available inside a kernel.
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+  get_range_accessor(const void *ptr) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
+    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
+    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
+
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+
+    auto original_buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    const ptrdiff_t typed_offset = offset / sizeof(T);
+    eigen_assert(typed_offset >= 0);
+    const auto typed_size = original_buffer.get_size() / sizeof(T);
+    auto buffer = original_buffer.template reinterpret<
+        typename Eigen::internal::remove_const<T>::type>(
+        cl::sycl::range<1>(typed_size));
+    const ptrdiff_t size = buffer.get_count() - typed_offset;
+    eigen_assert(size >= 0);
+    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
+                               1, AcMd, global_access, is_place_holder>
+        placeholder_accessor_t;
+    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
+    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
+                                           cl::sycl::id<1>(typed_offset)),
+                    static_cast<size_t>(typed_offset),
+                    reinterpret_cast<std::intptr_t>(start_ptr));
   }
 
-  /// allocating memory on the cpu
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
-    return internal::aligned_malloc(8);
+  /// Get a range accessor to the virtual pointer's device memory with a
+  /// specified size.
+  template <cl::sycl::access::mode AcMd, typename Index>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+                     const Index n_bytes) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    eigen_assert(n_bytes >= 0);
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    auto buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset >= 0);
+    eigen_assert(offset + n_bytes <= buffer.get_size());
+    return buffer.template get_access<AcMd, global_access>(
+        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
   }
 
-  // some runtime conditions that can be applied here
-  bool isDeviceSuitable() const { return true; }
+  /// Creation of sycl accessor for a buffer. This function first tries to find
+  /// the buffer in the buffer_map. If found it gets the accessor from it, if
+  /// not, the function then adds an entry by creating a sycl buffer for that
+  /// particular pointer.
+  template <cl::sycl::access::mode AcMd>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_buffer(ptr)
+        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
+            cgh);
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+      const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_buffer(ptr);
+  }
+
+  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_offset(ptr);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename Lhs,
+            typename Rhs, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+                                                  const Rhs &rhs, OutPtr outptr,
+                                                  Range thread_range,
+                                                  Index scratchSize,
+                                                  T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      lhs.bind(cgh);
+      rhs.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename OutPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+                                                 OutPtr &outptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+    template <typename OutScalar, typename sycl_kernel, typename InPtr,
+           typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+#ifdef EIGEN_EXCEPTIONS
+    m_queue.wait_and_throw();
+#else
+    m_queue.wait();
+#endif
+  }
+
+
+  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
+    set_latest_event(e);
+#ifndef EIGEN_SYCL_ASYNC_EXECUTION
+    synchronize();
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+                                              Index &rng, Index &GRange) const {
+    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                        static_cast<Index>(tileSize));
+    rng = n;
+    if (rng == 0) rng = static_cast<Index>(1);
+    GRange = rng;
+    if (tileSize > GRange)
+      tileSize = GRange;
+    else if (GRange > tileSize) {
+      Index xMode = static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
+    Index max_workgroup_Size =
+        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size =
+        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                 static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[1] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
+    Index max_workgroup_Size =
+        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size =
+        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                 static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[2] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0)
+        global_range[2] += static_cast<Index>(local_range[2] - xMode);
+    }
+    pow_of_2 = static_cast<Index>(
+        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size /
+                                        (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return false;
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return true;
+#else
+    return m_device_info.local_mem_type ==
+           cl::sycl::info::local_mem_type::local;
+#endif
+  }
+
+  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
+    return m_device_info.max_mem_alloc_size;
+  }
+
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return m_device_info.max_compute_units;
+  }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+    return m_device_info.max_work_group_size;
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+    return m_device_info.max_work_item_sizes;
+  }
+
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL doesnot have such concept
+    return 2;
+  }
+
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+    return m_device_info.local_mem_size;
+  }
+
+  // This function returns the nearest power of 2 Work-group size which is <=
+  // maximum device workgroup size.
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return getPowerOfTwo(m_device_info.max_work_group_size, false);
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
-    ::memcpy(dst, src, n);
+  EIGEN_STRONG_INLINE std::string getPlatformName() const {
+    return m_device_info.platform_name;
   }
 
-  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
-    auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
-    memcpy(host_acc.get_pointer(), src, n);
+  EIGEN_STRONG_INLINE std::string getDeviceName() const {
+    return m_device_info.device_name;
   }
- /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon.
-  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
-    auto it = buffer_map.find(src);
-    if (it != buffer_map.end()) {
-      auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
-      memcpy(dst,host_acc.get_pointer(),  n);
-    } else{
-      eigen_assert("no device memory found. The memory might be destroyed before creation");
+
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+    return m_device_info.device_vendor;
+  }
+
+  // This function returns the nearest power of 2
+  // if roundup is true returns result>=wgsize
+  // else it return result <= wgsize
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
+    if (roundUp) --wGSize;
+    wGSize |= (wGSize >> 1);
+    wGSize |= (wGSize >> 2);
+    wGSize |= (wGSize >> 4);
+    wGSize |= (wGSize >> 8);
+    wGSize |= (wGSize >> 16);
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
+    wGSize |= (wGSize >> 32);
+#endif
+    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const {
+    if (!exception_caught_) {
+      synchronize();
     }
+    return !exception_caught_;
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+    std::lock_guard<std::mutex> lock(event_mutex_);
+    return latest_events_[std::this_thread::get_id()];
+#else
+    eigen_assert(false);
+    return cl::sycl::event();
+#endif
+  }
+
+  // destructor
+  ~QueueInterface() {
+    pMapper.clear();
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.clear();
+#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
-    ::memset(buffer, c, n);
+ protected:
+  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+    std::lock_guard<std::mutex> lock(event_mutex_);
+    latest_events_[std::this_thread::get_id()] = e;
+#else
+    EIGEN_UNUSED_VARIABLE(e);
+#endif
+  }
+
+  void synchronize_and_callback(cl::sycl::event e,
+                                const std::function<void()> &callback) const {
+    set_latest_event(e);
+    if (callback) {
+      auto callback_ = [=]() {
+#ifdef EIGEN_EXCEPTIONS
+        cl::sycl::event(e).wait_and_throw();
+#else
+        cl::sycl::event(e).wait();
+#endif
+        callback();
+      };
+      m_thread_pool.Schedule(std::move(callback_));
+    } else {
+#ifdef EIGEN_EXCEPTIONS
+      m_queue.wait_and_throw();
+#else
+      m_queue.wait();
+#endif
+    }
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-  return 1;
+
+  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
+    bool exception_caught = false;
+    for (const auto &e : exceptions) {
+      if (e) {
+        exception_caught = true;
+        EIGEN_THROW_X(e);
+      }
+    }
+    return exception_caught;
   }
+
+  /// class members:
+  bool exception_caught_ = false;
+
+  mutable std::mutex pmapper_mutex_;
+
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+  mutable std::mutex event_mutex_;
+  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
+#endif
+
+  /// std::map is the container used to make sure that we create only one buffer
+  /// per pointer. The lifespan of the buffer now depends on the lifespan of
+  /// SyclDevice. If a non-read-only pointer is needed to be accessed on the
+  /// host we should manually deallocate it.
+  mutable TensorSycl::internal::PointerMapper pMapper;
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+  mutable std::unordered_set<void *> scratch_buffers;
+#endif
+  /// sycl queue
+  mutable cl::sycl::queue m_queue;
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  mutable cl::sycl::program m_prog;
+#endif
+
+  /// The thread pool is used to wait on events and call callbacks
+  /// asynchronously
+  mutable Eigen::ThreadPool m_thread_pool;
+
+  const TensorSycl::internal::SyclDeviceInfo m_device_info;
 };
 
+struct SyclDeviceBase {
+  /// QueueInterface is not owned. it is the caller's responsibility to destroy
+  /// it
+  const QueueInterface *m_queue_stream;
+  explicit SyclDeviceBase(const QueueInterface *queue_stream)
+      : m_queue_stream(queue_stream) {}
+  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
+    return m_queue_stream;
+  }
+};
+
+// Here is a sycl device struct which accept the sycl queue interface
+// as an input
+struct SyclDevice : public SyclDeviceBase {
+  explicit SyclDevice(const QueueInterface *queue_stream)
+      : SyclDeviceBase(queue_stream) {}
+
+  // this is the accessor used to construct the evaluator
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+  get_range_accessor(const void *ptr) const {
+    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
+  }
+
+  // get sycl accessor
+  template <cl::sycl::access::mode AcMd>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
+  }
+
+  /// Accessing the created sycl device buffer for the device pointer
+  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+      const void *ptr) const {
+    return queue_stream()->get_sycl_buffer(ptr);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+                                              Index &rng, Index &GRange) const {
+    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// allocate device memory
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+    return queue_stream()->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+    return queue_stream()->allocate_temp(num_bytes);
+  }
+
+  /// deallocate device memory
+  EIGEN_STRONG_INLINE void deallocate(void *p) const {
+    queue_stream()->deallocate(p);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
+    queue_stream()->deallocate_temp(buffer);
+  }
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE void deallocate_temp(
+      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
+    queue_stream()->deallocate_temp(buffer);
+  }
+  EIGEN_STRONG_INLINE void deallocate_all() const {
+    queue_stream()->deallocate_all();
+  }
+
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+      cl::sycl::access::mode::read_write, data_t>
+  get(data_t *data) const {
+    return queue_stream()->get(data);
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+                                        data_t>
+          data) const {
+    return queue_stream()->get(data);
+  }
+
+  /// attach existing buffer
+  EIGEN_STRONG_INLINE void *attach_buffer(
+      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+    return queue_stream()->attach_buffer(buf);
+  }
+  /// detach buffer
+  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+    queue_stream()->detach_buffer(p);
+  }
+  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+    return queue_stream()->get_offset(ptr);
+  }
+
+  // some runtime conditions that can be applied here
+  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+
+  /// memcpyHostToDevice
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(
+      Index *dst, const Index *src, size_t n,
+      std::function<void()> callback = {}) const {
+    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
+  }
+  /// memcpyDeviceToHost
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+      void *dst, const Index *src, size_t n,
+      std::function<void()> callback = {}) const {
+    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
+  }
+  /// the memcpy function
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+    queue_stream()->memcpy(dst, src, n);
+  }
+  /// the memset function
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    queue_stream()->memset(data, c, n);
+  }
+  /// returning the sycl queue
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
+    return queue_stream()->sycl_queue();
+  }
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
+    return queue_stream()->program();
+  }
+#endif
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on sycl devices.
+    return firstLevelCacheSize();
+  }
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return queue_stream()->getNumSyclMultiProcessors();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+    return queue_stream()->maxSyclThreadsPerBlock();
+  }
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+    return queue_stream()->maxWorkItemSizes();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL doesnot have such concept
+    return queue_stream()->maxSyclThreadsPerMultiProcessor();
+  }
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+    return queue_stream()->sharedMemPerBlock();
+  }
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+  }
+
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
+    return queue_stream()->getPowerOfTwo(val, roundUp);
+  }
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return queue_stream()->majorDeviceVersion();
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+    queue_stream()->synchronize();
+  }
+  EIGEN_STRONG_INLINE void async_synchronize(
+      cl::sycl::event e = cl::sycl::event()) const {
+    queue_stream()->async_synchronize(e);
+  }
+  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+    return queue_stream()->get_latest_event();
+  }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+    return queue_stream()->has_local_memory();
+  }
+  EIGEN_STRONG_INLINE long max_buffer_size() const {
+    return queue_stream()->max_buffer_size();
+  }
+  EIGEN_STRONG_INLINE std::string getPlatformName() const {
+    return queue_stream()->getPlatformName();
+  }
+  EIGEN_STRONG_INLINE std::string getDeviceName() const {
+    return queue_stream()->getDeviceName();
+  }
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+    return queue_stream()->getDeviceVendor();
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+};
 }  // end namespace Eigen
 
 #endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index a5e084a24095b2da015f07e42aebf706e66788aa..e524b535a88af5b47b261eaa9b4a3c554f5b290b 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,67 +12,6 @@
 
 namespace Eigen {
 
-// Use the SimpleThreadPool by default. We'll switch to the new non blocking
-// thread pool later.
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-
-
-// Barrier is an object that allows one or more threads to wait until
-// Notify has been called a specified number of times.
-class Barrier {
- public:
-  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
-    eigen_assert(((count << 1) >> 1) == count);
-  }
-  ~Barrier() {
-    eigen_plain_assert((state_>>1) == 0);
-  }
-
-  void Notify() {
-    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
-    if (v != 1) {
-      eigen_assert(((v + 2) & ~1) != 0);
-      return;  // either count has not dropped to 0, or waiter is not waiting
-    }
-    std::unique_lock<std::mutex> l(mu_);
-    eigen_assert(!notified_);
-    notified_ = true;
-    cv_.notify_all();
-  }
-
-  void Wait() {
-    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
-    if ((v >> 1) == 0) return;
-    std::unique_lock<std::mutex> l(mu_);
-    while (!notified_) {
-      cv_.wait(l);
-    }
-  }
-
- private:
-  std::mutex mu_;
-  std::condition_variable cv_;
-  std::atomic<unsigned int> state_;  // low bit is waiter flag
-  bool notified_;
-};
-
-
-// Notification is an object that allows a user to to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object,
-// but only one caller must call Notify() on the object.
-struct Notification : Barrier {
-  Notification() : Barrier(1) {};
-};
-
-
 // Runs an arbitrary function and then calls Notify() on the passed in
 // Notification.
 template <typename Function, typename... Args> struct FunctionWrapperWithNotification
@@ -102,22 +41,75 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
   }
 }
 
+// An abstract interface to a device specific memory allocator.
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+};
 
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
+      : pool_(pool), num_threads_(num_cores), allocator_(allocator) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return internal::aligned_malloc(num_bytes);
+    return allocator_ ? allocator_->allocate(num_bytes)
+        : internal::aligned_malloc(num_bytes);
   }
 
   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    internal::aligned_free(buffer);
+    if (allocator_) {
+      allocator_->deallocate(buffer);
+    } else {
+      internal::aligned_free(buffer);
+    }
+  }
+
+    EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    deallocate(buffer);
+  }
+
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+    return data;
   }
 
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
     ::memcpy(dst, src, n);
+#else
+    // TODO(rmlarsen): Align blocks on cache lines.
+    // We have observed that going beyond 4 threads usually just wastes
+    // CPU cycles due to the threads competing for memory bandwidth, so we
+    // statically schedule at most 4 block copies here.
+    const size_t kMinBlockSize = 32768;
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    if (n <= kMinBlockSize || num_threads < 2) {
+      ::memcpy(dst, src, n);
+    } else {
+      const char* src_ptr = static_cast<const char*>(src);
+      char* dst_ptr = static_cast<char*>(dst);
+      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+      Barrier barrier(static_cast<int>(num_threads - 1));
+      // Launch the last 3 blocks on worker threads.
+      for (size_t i = 1; i < num_threads; ++i) {
+        enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
+                   numext::mini(blocksize, n - (i * blocksize)));
+        });
+      }
+      // Launch the first block on the main thread.
+      ::memcpy(dst_ptr, src_ptr, blocksize);
+      barrier.Wait();
+    }
+#endif
   }
   EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);
@@ -134,6 +126,12 @@ struct ThreadPoolDevice {
     return num_threads_;
   }
 
+  // Number of theads available in the underlying thread pool. This number can
+  // be different from the value returned by numThreads().
+  EIGEN_STRONG_INLINE int numThreadsInPool() const {
+    return pool_->NumThreads();
+  }
+
   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
     return l1CacheSize();
   }
@@ -149,23 +147,31 @@ struct ThreadPoolDevice {
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
+  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f,
+                                            Args&&... args) const {
     Notification* n = new Notification();
-    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n,
+                  std::move(f), args...));
     return n;
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
-                                                Function&& f,
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
                                                 Args&&... args) const {
-    pool_->Schedule(std::bind(
-        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b,
+                  std::move(f), args...));
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
-    pool_->Schedule(std::bind(f, args...));
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f,
+                                                 Args&&... args) const {
+    if (sizeof...(args) > 0) {
+      pool_->Schedule(std::bind(std::move(f), args...));
+    } else {
+      pool_->Schedule(std::move(f));
+    }
   }
 
   // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
@@ -174,44 +180,189 @@ struct ThreadPoolDevice {
     return pool_->CurrentThreadId();
   }
 
-  // parallelFor executes f with [0, n) arguments in parallel and waits for
-  // completion. F accepts a half-open interval [first, last).
-  // Block size is choosen based on the iteration cost and resulting parallel
+  // WARNING: This function is synchronous and will block the calling thread.
+  //
+  // Synchronous parallelFor executes f with [0, n) arguments in parallel and
+  // waits for completion. F accepts a half-open interval [first, last). Block
+  // size is chosen based on the iteration cost and resulting parallel
   // efficiency. If block_align is not nullptr, it is called to round up the
   // block size.
   void parallelFor(Index n, const TensorOpCost& cost,
                    std::function<Index(Index)> block_align,
                    std::function<void(Index, Index)> f) const {
-    typedef TensorCostModel<ThreadPoolDevice> CostModel;
+    if (EIGEN_PREDICT_FALSE(n <= 0)){
+      return;
+    // Compute small problems directly in the caller thread.
+    } else if (n == 1 || numThreads() == 1 ||
+               CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block.count));
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [=, &handleRange, &barrier, &f](Index firstIdx,
+                                                  Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+      // Single block or less, execute directly.
+      f(firstIdx, lastIdx);
+      barrier.Notify();
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      handleRange(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
+    }
+
+    barrier.Wait();
+  }
+
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
+  }
+
+  // WARNING: This function is asynchronous and will not block the calling thread.
+  //
+  // Asynchronous parallelFor executes f with [0, n) arguments in parallel
+  // without waiting for completion. When the last block finished, it will call
+  // 'done' callback. F accepts a half-open interval [first, last). Block size
+  // is chosen based on the iteration cost and resulting parallel efficiency. If
+  // block_align is not nullptr, it is called to round up the block size.
+  void parallelForAsync(Index n, const TensorOpCost& cost,
+                        std::function<Index(Index)> block_align,
+                        std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    // Compute small problems directly in the caller thread.
     if (n <= 1 || numThreads() == 1 ||
         CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
+      done();
       return;
     }
 
-    // Calculate block size based on (1) the iteration cost and (2) parallel
-    // efficiency. We want blocks to be not too small to mitigate
-    // parallelization overheads; not too large to mitigate tail
-    // effect and potential load imbalance and we also want number
-    // of blocks to be evenly dividable across threads.
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    ParallelForAsyncContext* const ctx =
+        new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule(
+            [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+
+      // Single block or less, execute directly.
+      ctx->f(firstIdx, lastIdx);
+
+      // Delete async context if it was the last block.
+      if (ctx->count.fetch_sub(1) == 1) delete ctx;
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      ctx->handle_range(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
+    }
+  }
+
+  // Convenience wrapper for parallelForAsync that does not align blocks.
+  void parallelForAsync(Index n, const TensorOpCost& cost,
+                        std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
+  }
+
+  // Thread pool accessor.
+  ThreadPoolInterface* getPool() const { return pool_; }
+
+  // Allocator accessor.
+  Allocator* allocator() const { return allocator_; }
 
-    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+ private:
+  typedef TensorCostModel<ThreadPoolDevice> CostModel;
+
+  // For parallelForAsync we must keep passed in closures on the heap, and
+  // delete them only after `done` callback finished.
+  struct ParallelForAsyncContext {
+    ParallelForAsyncContext(Index block_count,
+                            std::function<void(Index, Index)> block_f,
+                            std::function<void()> done_callback)
+        : count(block_count),
+          f(std::move(block_f)),
+          done(std::move(done_callback)) {}
+    ~ParallelForAsyncContext() { done(); }
+
+    std::atomic<Index> count;
+    std::function<void(Index, Index)> f;
+    std::function<void()> done;
+
+    std::function<void(Index, Index)> handle_range;
+  };
+
+  struct ParallelForBlock {
+    Index size;   // block size
+    Index count;  // number of blocks
+  };
+
+  // Calculates block size based on (1) the iteration cost and (2) parallel
+  // efficiency. We want blocks to be not too small to mitigate parallelization
+  // overheads; not too large to mitigate tail effect and potential load
+  // imbalance and we also want number of blocks to be evenly dividable across
+  // threads.
+  ParallelForBlock CalculateParallelForBlock(
+      const Index n, const TensorOpCost& cost,
+      std::function<Index(Index)> block_align) const {
+    const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
     const Index max_oversharding_factor = 4;
     Index block_size = numext::mini(
-        n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
-                               block_size_f));
+        n, numext::maxi<Index>(
+               divup<Index>(n, max_oversharding_factor * numThreads()),
+               block_size_f));
     const Index max_block_size = numext::mini(n, 2 * block_size);
+
     if (block_align) {
       Index new_block_size = block_align(block_size);
       eigen_assert(new_block_size >= block_size);
       block_size = numext::mini(n, new_block_size);
     }
+
     Index block_count = divup(n, block_size);
+
     // Calculate parallel efficiency as fraction of total CPU time used for
     // computations:
     double max_efficiency =
         static_cast<double>(block_count) /
         (divup<int>(block_count, numThreads()) * numThreads());
+
     // Now try to increase block size up to max_block_size as long as it
     // doesn't decrease parallel efficiency.
     for (Index prev_block_count = block_count;
@@ -244,36 +395,12 @@ struct ThreadPoolDevice {
       }
     }
 
-    // Recursively divide size into halves until we reach block_size.
-    // Division code rounds mid to block_size, so we are guaranteed to get
-    // block_count leaves that do actual computations.
-    Barrier barrier(static_cast<unsigned int>(block_count));
-    std::function<void(Index, Index)> handleRange;
-    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
-      if (last - first <= block_size) {
-        // Single block or less, execute directly.
-        f(first, last);
-        barrier.Notify();
-        return;
-      }
-      // Split into halves and submit to the pool.
-      Index mid = first + divup((last - first) / 2, block_size) * block_size;
-      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
-      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
-    };
-    handleRange(0, n);
-    barrier.Wait();
-  }
-
-  // Convenience wrapper for parallelFor that does not align blocks.
-  void parallelFor(Index n, const TensorOpCost& cost,
-                   std::function<void(Index, Index)> f) const {
-    parallelFor(n, cost, nullptr, std::move(f));
+    return {block_size, block_count};
   }
 
- private:
   ThreadPoolInterface* pool_;
   int num_threads_;
+  Allocator* allocator_;
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 451940de3683e4f1f0dcf0325258644d72f49491..f0f1e832a89fa9cde83f74eb87043aa23fde9352 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -32,16 +32,16 @@ namespace Eigen {
 // Boilerplate code
 namespace internal {
 
-template<std::size_t n, typename Dimension> struct dget {
-  static const std::size_t value = get<n, Dimension>::value;
+template<std::ptrdiff_t n, typename Dimension> struct dget {
+  static const std::ptrdiff_t value = get<n, Dimension>::value;
 };
 
 
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(array<Index, NumIndices> const& indices,
+  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
                           const Dimensions& dimensions)
   {
     return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
@@ -50,21 +50,21 @@ struct fixed_size_tensor_index_linearization_helper
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(array<Index, NumIndices> const&, const Dimensions&)
+  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&)
   {
     return 0;
   }
 };
 
-template<typename Index, std::size_t n>
+template<typename Index, std::ptrdiff_t n>
 struct fixed_size_tensor_index_extraction_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(const Index index,
+  static EIGEN_STRONG_INLINE Index run(const Index index,
                           const Dimensions& dimensions)
   {
     const Index mult = (index == n-1) ? 1 : 0;
@@ -77,7 +77,7 @@ template<typename Index>
 struct fixed_size_tensor_index_extraction_helper<Index, 0>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(const Index,
+  static EIGEN_STRONG_INLINE Index run(const Index,
                           const Dimensions&)
   {
     return 0;
@@ -90,9 +90,11 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0>
 // Fixed size
 #ifndef EIGEN_EMULATE_CXX11_META_H
 template <typename std::ptrdiff_t... Indices>
-struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
+struct Sizes {
   typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+  const Base t = Base();
   static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
+  static const ptrdiff_t count = Base::count;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
     return Base::count;
@@ -119,17 +121,17 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
     return *this;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
-    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const {
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
+  ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
   }
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
+  ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
   }
 };
 
@@ -142,25 +144,25 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indi
 
 #else
 
-template <std::size_t n>
+template <std::ptrdiff_t n>
 struct non_zero_size {
-  typedef internal::type2val<std::size_t, n> type;
+  typedef internal::type2val<std::ptrdiff_t, n> type;
 };
 template <>
 struct non_zero_size<0> {
   typedef internal::null_type type;
 };
 
-template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
+template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes {
   typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
-  static const size_t count = Base::count;
-  static const std::size_t total_size = internal::arg_prod<Base>::value;
+  static const std::ptrdiff_t count = Base::count;
+  static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const {
     return count;
   }
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() {
     return internal::arg_prod<Base>::value;
   }
 
@@ -176,7 +178,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
-  explicit Sizes(std::initializer_list<std::size_t>) {
+  explicit Sizes(std::initializer_list<std::ptrdiff_t>) {
     // todo: add assertion
   }
 #else
@@ -211,18 +213,18 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+  ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
     return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
   }
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+  ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
     return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
   }
 };
 
 namespace internal {
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
   return Sizes<V1, V2, V3, V4, V5>::total_size;
 }
 }
@@ -231,7 +233,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2,
 
 // Boilerplate
 namespace internal {
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct tensor_index_linearization_helper
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -243,7 +245,7 @@ struct tensor_index_linearization_helper
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -262,7 +264,7 @@ struct DSizes : array<DenseIndex, NumDims> {
   typedef array<DenseIndex, NumDims> Base;
   static const int count = NumDims;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const {
     return NumDims;
   }
 
@@ -282,6 +284,57 @@ struct DSizes : array<DenseIndex, NumDims> {
     (*this)[0] = i0;
   }
 
+  EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+
+  // Enable DSizes index type promotion only if we are promoting to the
+  // larger type, e.g. allow to promote dimensions of type int to long.
+  template<typename OtherIndex>
+  EIGEN_DEVICE_FUNC
+  explicit DSizes(const array<OtherIndex, NumDims>& other,
+                  // Default template parameters require c++11.
+                  typename internal::enable_if<
+                     internal::is_same<
+                         DenseIndex,
+                         typename internal::promote_index_type<
+                             DenseIndex,
+                             OtherIndex
+                         >::type
+                     >::value, void*>::type = 0) {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = static_cast<DenseIndex>(other[i]);
+    }
+  }
+
+#ifdef EIGEN_HAS_INDEX_LIST
+  template <typename FirstType, typename... OtherTypes>
+  EIGEN_DEVICE_FUNC
+  explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+    for (int i = 0; i < dimensions.count; ++i) {
+      (*this)[i] = dimensions[i];
+    }
+  }
+#endif
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+  template <typename std::ptrdiff_t... Indices>
+  EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+#else
+  template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
+  EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+#endif
+
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   template<typename... IndexTypes> EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
@@ -330,12 +383,21 @@ struct DSizes : array<DenseIndex, NumDims> {
   }
 };
 
-
-
+template <typename IndexType, int NumDims>
+std::ostream& operator<<(std::ostream& os,
+                         const DSizes<IndexType, NumDims>& dims) {
+  os << "[";
+  for (int i = 0; i < NumDims; ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
+}
 
 // Boilerplate
 namespace internal {
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct tensor_vsize_index_linearization_helper
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -347,7 +409,7 @@ struct tensor_vsize_index_linearization_helper
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -362,10 +424,10 @@ struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 namespace internal {
 
 template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
-  static const size_t value = NumDims;
+  static const ptrdiff_t value = NumDims;
 };
 template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
-  static const size_t value = NumDims;
+  static const ptrdiff_t value = NumDims;
 };
 #ifndef EIGEN_EMULATE_CXX11_META_H
 template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
@@ -375,42 +437,42 @@ template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...
 static const std::ptrdiff_t value = Sizes<Indices...>::count;
 };
 template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
-  return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
+  return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
 }
 template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
   eigen_assert(false && "should never be called");
   return -1;
 }
 #else
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
-  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
+  static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
 };
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
-  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
+  static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
 };
-template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
+template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
   return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
 }
 
 #endif
 
 
-template <typename Dims1, typename Dims2, size_t n, size_t m>
+template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
 struct sizes_match_below_dim {
-  static EIGEN_DEVICE_FUNC  inline bool run(Dims1&, Dims2&) {
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
     return false;
   }
 };
-template <typename Dims1, typename Dims2, size_t n>
+template <typename Dims1, typename Dims2, ptrdiff_t n>
 struct sizes_match_below_dim<Dims1, Dims2, n, n> {
-  static EIGEN_DEVICE_FUNC  inline bool run(Dims1& dims1, Dims2& dims2) {
-    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
+    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &&
         sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
   }
 };
 template <typename Dims1, typename Dims2>
 struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
-  static EIGEN_DEVICE_FUNC  inline bool run(Dims1&, Dims2&) {
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
     return true;
   }
 };
@@ -419,7 +481,7 @@ struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
 
 
 template <typename Dims1, typename Dims2>
-EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
   return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
 }
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 06987132b69cc00c51707fea445655cc54cd5802..a48d035f5aad48f0560e613b8d361f25d1075e30 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -32,6 +32,7 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename MakePointer_<Scalar>::Type PointerType;
 
   enum {
     Flags = 0
@@ -41,6 +42,8 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
     // Intermediate typedef to workaround MSVC issue.
     typedef MakePointer_<T> MakePointerT;
     typedef typename MakePointerT::Type Type;
+
+
   };
 };
 
@@ -73,6 +76,8 @@ class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>,
   typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
 
+  static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
       : m_xpr(expr), m_buffer(buffer) {}
 
@@ -98,38 +103,60 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   typedef typename XprType::Index Index;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_device(device),
-          m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
-  { }
+  static const int NumDims = internal::traits<ArgType>::NumDimensions;
 
-  // Used for accessor extraction in SYCL Managed TensorMap:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
-    return m_op;
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef internal::TensorBlockAssignment<
+      CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index>
+      TensorBlockAssignment;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
+
+
+  EIGEN_STRONG_INLINE ~TensorEvaluator() {
   }
 
-  typedef typename internal::traits<const TensorEvalToOp<ArgType, MakePointer_> >::template MakePointer<CoeffReturnType>::Type DevicePointer;
+
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
     EIGEN_UNUSED_VARIABLE(scalar);
     eigen_assert(scalar == NULL);
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }
 
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType scalar, EvalSubExprsCallback done) {
+    EIGEN_UNUSED_VARIABLE(scalar);
+    eigen_assert(scalar == NULL);
+    m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
+  }
+#endif
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
     m_buffer[i] = m_impl.coeff(i);
   }
@@ -137,7 +164,34 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return m_impl.getResourceRequirements();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    // Add `m_buffer` as destination buffer to the block descriptor.
+    desc.template AddDestinationBuffer<Layout>(
+        /*dst_base=*/m_buffer + desc.offset(),
+        /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
+
+    ArgTensorBlock block =
+        m_impl.block(desc, scratch, /*root_of_expr_ast=*/true);
+
+    // If block was evaluated into a destination buffer, there is no need to do
+    // an assignment.
+    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()),
+              m_buffer, desc.offset()),
+          block.expr());
+    }
+    block.cleanup();
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -159,19 +213,20 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
         TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
   ArgType expression() const { return m_expression; }
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_buffer.bind(cgh);
+  }
+  #endif
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-  /// added for sycl in order to construct the buffer from the sycl device
-  const Device& device() const{return m_device;}
 
  private:
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
-  DevicePointer m_buffer;
-  const XprType& m_op;
+  EvaluatorPointerType m_buffer;
   const ArgType m_expression;
 };
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 834ce07df5580058f518cbb6d99edb02c9db4a6a..3aff7fa0142d46ef09206942ff78a23eea94b698 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -32,44 +32,72 @@ struct TensorEvaluator
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef Derived XprType;
+  static const int PacketSize =  PacketType<CoeffReturnType, Device>::size;
+  typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   // NumDimensions is -1 for variable dim tensors
   static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
                                internal::traits<Derived>::NumDimensions : 0;
 
   enum {
-    IsAligned = Derived::IsAligned,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    Layout = Derived::Layout,
-    CoordAccess = NumCoords > 0,
-    RawAccess = true
+    IsAligned          = Derived::IsAligned,
+    PacketAccess       = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess        = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+    PreferBlockAccess  = false,
+    Layout             = Derived::Layout,
+    CoordAccess        = NumCoords > 0,
+    RawAccess          = true
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
-      : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
+        m_dims(m.dimensions()),
+        m_device(device)
   { }
 
-  // Used for accessor extraction in SYCL Managed TensorMap:
-  const Derived& derived() const { return m_impl; }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
-    if (dest) {
-      m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
+      m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
       return false;
     }
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    eigen_assert(m_data);
+    eigen_assert(m_data != NULL);
     return m_data[index];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
-    eigen_assert(m_data);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
+    eigen_assert(m_data != NULL);
     return m_data[index];
   }
 
@@ -79,6 +107,18 @@ struct TensorEvaluator
     return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
   }
 
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
+  partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
+  {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+  }
+
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
@@ -86,7 +126,7 @@ struct TensorEvaluator
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
-    eigen_assert(m_data);
+    eigen_assert(m_data != NULL);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
     } else {
@@ -94,8 +134,9 @@ struct TensorEvaluator
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
-    eigen_assert(m_data);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
+  coeffRef(const array<DenseIndex, NumCoords>& coords) {
+    eigen_assert(m_data != NULL);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
     } else {
@@ -105,19 +146,50 @@ struct TensorEvaluator
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
   }
 
-  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    assert(m_data != NULL);
+    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+  }
+
+  template<typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    assert(m_data != NULL);
+
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
+                                            Index>
+        TensorBlockAssign;
 
-  /// required by sycl in order to construct sycl buffer from raw pointer
-  const Device& device() const{return m_device;}
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(),
+                                  internal::strides<Layout>(m_dims), m_data,
+                                  desc.offset()),
+        block.expr());
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
  protected:
-  typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data;
+  EvaluatorPointerType m_data;
   Dimensions m_dims;
-  const Device& m_device;
-  const Derived& m_impl;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 namespace {
@@ -126,7 +198,7 @@ T loadConstant(const T* address) {
   return *address;
 }
 // Use the texture cache on CUDA devices whenever possible
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float loadConstant(const float* address) {
   return __ldg(address);
@@ -140,6 +212,13 @@ Eigen::half loadConstant(const Eigen::half* address) {
   return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
 }
 #endif
+#ifdef EIGEN_USE_SYCL
+// overload of load constant should be implemented here based on range access
+template <cl::sycl::access::mode AcMd, typename T>
+T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
+  return *address;
+}
+#endif
 }
 
 
@@ -152,40 +231,64 @@ struct TensorEvaluator<const Derived, Device>
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef const Derived XprType;
+  typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
+  typedef StorageMemory<const Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
 
   // NumDimensions is -1 for variable dim tensors
   static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
                                internal::traits<Derived>::NumDimensions : 0;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = Derived::IsAligned,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    Layout = Derived::Layout,
-    CoordAccess = NumCoords > 0,
-    RawAccess = true
+    IsAligned         = Derived::IsAligned,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = internal::is_arithmetic<ScalarNoConst>::value,
+    PreferBlockAccess = false,
+    Layout            = Derived::Layout,
+    CoordAccess       = NumCoords > 0,
+    RawAccess         = true
   };
 
-  // Used for accessor extraction in SYCL Managed TensorMap:
-  const Derived& derived() const { return m_impl; }
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
-      : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m)
+  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
-      m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
+      m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
       return false;
     }
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    eigen_assert(m_data);
+    eigen_assert(m_data != NULL);
     return loadConstant(m_data+index);
   }
 
@@ -195,8 +298,20 @@ struct TensorEvaluator<const Derived, Device>
     return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
   }
 
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
+  partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
+  {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
-    eigen_assert(m_data);
+    eigen_assert(m_data != NULL);
     const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
                         : m_dims.IndexOfRowMajor(coords);
     return loadConstant(m_data+index);
@@ -204,19 +319,32 @@ struct TensorEvaluator<const Derived, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }
 
-  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
 
-  /// added for sycl in order to construct the buffer from the sycl device
-  const Device& device() const{return m_device;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    assert(m_data != NULL);
+    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+  }
 
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
  protected:
-  typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data;
+  EvaluatorPointerType m_data;
   Dimensions m_dims;
-  const Device& m_device;
-  const Derived& m_impl;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
@@ -229,15 +357,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 {
   typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
 
-  enum {
-    IsAligned = true,
-    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
-  };
-
-  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
       : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
@@ -246,13 +365,42 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
+    #ifdef EIGEN_USE_SYCL
+    &&  (PacketType<CoeffReturnType, Device>::size >1)
+    #endif
+    ,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    done(true);
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
@@ -268,16 +416,17 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
-
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
-  /// required by sycl in order to extract the accessor
-  NullaryOp functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC  EvaluatorPointerType data() const { return NULL; }
 
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_argImpl.bind(cgh);
+  }
+#endif
 
  private:
   const NullaryOp m_functor;
@@ -295,32 +444,60 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
 
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned          = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess       = int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+                         int(internal::functor_traits<UnaryOp>::PacketAccess),
+    BlockAccess        = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess  = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout             = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess        = false,  // to be implemented
+    RawAccess          = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-    : m_functor(op.functor()),
+  TensorEvaluator(const XprType& op, const Device& device)
+    : m_device(device),
+      m_functor(op.functor()),
       m_argImpl(op.nestedExpression(), device)
   { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_argImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_argImpl.cleanup();
   }
 
@@ -341,15 +518,31 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
         TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+    return m_argImpl.getResourceRequirements().addCostPerCoeff(
+        {0, 0, functor_cost / PacketSize});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    return TensorBlock(m_argImpl.block(desc, scratch), m_functor);
+  }
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; }
-  /// added for sycl in order to construct the buffer from sycl device
-  UnaryOp functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
+    m_argImpl.bind(cgh);
+  }
+#endif
 
 
  private:
+  const Device EIGEN_DEVICE_REF m_device;
   const UnaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
 };
@@ -363,16 +556,23 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
 
   enum {
-    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
-                   internal::functor_traits<BinaryOp>::PacketAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
+                        int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess      = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::PacketAccess) &
+                        int(internal::functor_traits<BinaryOp>::PacketAccess),
+    BlockAccess       = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-    : m_functor(op.functor()),
+  TensorEvaluator(const XprType& op, const Device& device)
+    : m_device(device),
+      m_functor(op.functor()),
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
   {
@@ -384,8 +584,27 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static const int NumDims = internal::array_size<
+      typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock
+      LeftTensorBlock;
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
+      RightTensorBlock;
+
+  typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
+                                           RightTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -393,12 +612,25 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
     return m_leftImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): Evaluate two expression in parallel?
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr,
+                                            [done](bool) { done(true); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
   }
@@ -421,15 +653,34 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
-  /// required by sycl in order to extract the accessor
-  BinaryOp functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+    return internal::TensorBlockResourceRequirements::merge(
+               m_leftImpl.getResourceRequirements(),
+               m_rightImpl.getResourceRequirements())
+        .addCostPerCoeff({0, 0, functor_cost / PacketSize});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    desc.DropDestinationBuffer();
+    return TensorBlock(m_leftImpl.block(desc, scratch),
+                         m_rightImpl.block(desc, scratch), m_functor);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+  #endif
  private:
+  const Device EIGEN_DEVICE_REF m_device;
   const BinaryOp m_functor;
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
   TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -444,14 +695,20 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
 
   enum {
     IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
-                   internal::functor_traits<TernaryOp>::PacketAccess,
-    Layout = TensorEvaluator<Arg1Type, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    PacketAccess      = TensorEvaluator<Arg1Type, Device>::PacketAccess &&
+                        TensorEvaluator<Arg2Type, Device>::PacketAccess &&
+                        TensorEvaluator<Arg3Type, Device>::PacketAccess &&
+                        internal::functor_traits<TernaryOp>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
+                        TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
+                        TensorEvaluator<Arg3Type, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<Arg1Type, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+  TensorEvaluator(const XprType& op, const Device& device)
     : m_functor(op.functor()),
       m_arg1Impl(op.arg1Expression(), device),
       m_arg2Impl(op.arg2Expression(), device),
@@ -479,8 +736,14 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -488,13 +751,13 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
     return m_arg1Impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_arg1Impl.evalSubExprsIfNeeded(NULL);
     m_arg2Impl.evalSubExprsIfNeeded(NULL);
     m_arg3Impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_arg1Impl.cleanup();
     m_arg2Impl.cleanup();
     m_arg3Impl.cleanup();
@@ -521,14 +784,16 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; }
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_arg1Impl.bind(cgh);
+    m_arg2Impl.bind(cgh);
+    m_arg3Impl.bind(cgh);
+  }
+#endif
 
  private:
   const TernaryOp m_functor;
@@ -547,15 +812,23 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
-                   internal::packet_traits<Scalar>::HasBlend,
-    Layout = TensorEvaluator<IfArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = TensorEvaluator<ThenArgType, Device>::IsAligned &
+                        TensorEvaluator<ElseArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ThenArgType, Device>::PacketAccess &
+                        TensorEvaluator<ElseArgType, Device>::PacketAccess &
+                        PacketType<Scalar, Device>::HasBlend,
+    BlockAccess       = TensorEvaluator<IfArgType, Device>::BlockAccess &&
+                        TensorEvaluator<ThenArgType, Device>::BlockAccess &&
+                        TensorEvaluator<ElseArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<IfArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+  TensorEvaluator(const XprType& op, const Device& device)
     : m_condImpl(op.ifExpression(), device),
       m_thenImpl(op.thenExpression(), device),
       m_elseImpl(op.elseExpression(), device)
@@ -569,8 +842,42 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef typename XprType::Index Index;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+    typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock
+      IfArgTensorBlock;
+  typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock
+      ThenArgTensorBlock;
+  typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock
+      ElseArgTensorBlock;
+
+  struct TensorSelectOpBlockFactory {
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    struct XprType {
+      typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
+    };
+
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
+        const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
+      return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
+    }
+  };
+
+  typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
+                                           IfArgTensorBlock, ThenArgTensorBlock,
+                                           ElseArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -578,13 +885,26 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
     return m_condImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_condImpl.evalSubExprsIfNeeded(NULL);
     m_thenImpl.evalSubExprsIfNeeded(NULL);
     m_elseImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+      m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+        m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); });
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_condImpl.cleanup();
     m_thenImpl.cleanup();
     m_elseImpl.cleanup();
@@ -597,13 +917,15 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
   {
-    internal::Selector<PacketSize> select;
-    for (Index i = 0; i < PacketSize; ++i) {
-      select.select[i] = m_condImpl.coeff(index+i);
-    }
-    return internal::pblend(select,
-                            m_thenImpl.template packet<LoadMode>(index),
-                            m_elseImpl.template packet<LoadMode>(index));
+     internal::Selector<PacketSize> select;
+     EIGEN_UNROLL_LOOP
+     for (Index i = 0; i < PacketSize; ++i) {
+       select.select[i] = m_condImpl.coeff(index+i);
+     }
+     return internal::pblend(select,
+                             m_thenImpl.template packet<LoadMode>(index),
+                             m_elseImpl.template packet<LoadMode>(index));
+
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -613,14 +935,42 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    auto then_req = m_thenImpl.getResourceRequirements();
+    auto else_req = m_elseImpl.getResourceRequirements();
+
+    auto merged_req =
+        internal::TensorBlockResourceRequirements::merge(then_req, else_req);
+    merged_req.cost_per_coeff =
+        then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
+
+    return internal::TensorBlockResourceRequirements::merge(
+        m_condImpl.getResourceRequirements(), merged_req);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // It's unsafe to pass destination buffer to underlying expressions, because
+    // output might be aliased with one of the inputs.
+    desc.DropDestinationBuffer();
+
+    return TensorBlock(
+        m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch),
+        m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory());
+  }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_condImpl.bind(cgh);
+    m_thenImpl.bind(cgh);
+    m_elseImpl.bind(cgh);
+  }
+#endif
  private:
   TensorEvaluator<IfArgType, Device> m_condImpl;
   TensorEvaluator<ThenArgType, Device> m_thenImpl;
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f01d77c0a061a3c93775da1c7643e630d175bb80..c52fb77dc756e72121cd39541b6cd0c32005e6f8 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -12,31 +12,94 @@
 
 namespace Eigen {
 
-/** \class TensorExecutor
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief The tensor executor class.
-  *
-  * This class is responsible for launch the evaluation of the expression on
-  * the specified computing device.
-  */
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launch the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ *                      instructions)
+ * @tparam Tiling       can use block based tensor evaluation
+ *                      (see TensorBlock.h)
+ */
 namespace internal {
 
-// Default strategy: the expression is evaluated with a single cpu thread.
-template<typename Expression, typename Device, bool Vectorizable>
-class TensorExecutor
-{
+/**
+ * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely
+ * expensive. If expression has at least one broadcast op in it, and it supports
+ * block based evaluation, we always prefer it, even for the small tensors. For
+ * all other tileable ops, block evaluation overhead for small tensors (fits
+ * into L1) is too large, and we fallback on vectorized evaluation.
+ */
+
+// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
+
+template<typename Expression>
+struct ExpressionHasTensorBroadcastingOp {
+  enum { value = false };
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorAssignOp<LhsXprType, RhsXprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
+};
+
+template<typename UnaryOp, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
+  enum {
+    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
+        ExpressionHasTensorBroadcastingOp<RhsXprType>::value
+  };
+};
+
+template<typename Broadcast, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorBroadcastingOp<Broadcast, XprType> > {
+  enum { value = true };
+};
+
+// -------------------------------------------------------------------------- //
+
+/**
+ * Default strategy: the expression is evaluated sequentially with a single cpu
+ * thread, without vectorization and block evaluation.
+ */
+template <typename Expression, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling>
+class TensorExecutor {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
+
+  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
+  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
+  // violation. If this template is instantiated with a non-default device, it
+  // means that this header file was included without defining
+  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
+  static_assert(std::is_same<Device, DefaultDevice>::value,
+                "Default executor instantiated with non-default device. "
+                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
+                "EIGEN_USE_SYCL before including Eigen headers.");
+
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const Device& device = Device())
-  {
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-      for (Index i = 0; i < size; ++i) {
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -44,35 +107,48 @@ class TensorExecutor
   }
 };
 
-
-template<typename Expression>
-class TensorExecutor<Expression, DefaultDevice, true>
-{
+/**
+ * Default async execution strategy is not implemented. Currently it's only
+ * available for ThreadPoolDevice (see definition below).
+ */
+template <typename Expression, typename Device, typename DoneCallback,
+          bool Vectorizable, TiledEvaluation Tiling>
+class TensorAsyncExecutor {};
+
+/**
+ * Process all the data with a single cpu thread, using vectorized instructions.
+ */
+template <typename Expression>
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
+                     /*Tiling=*/TiledEvaluation::Off> {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
+
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
-  {
+  static EIGEN_STRONG_INLINE void run(
+      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      // Give the compiler a strong hint to unroll the loop. But don't insist
-      // on unrolling, because if the function is expensive the compiler should not
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      const int PacketSize = unpacket_traits<typename TensorEvaluator<
+          Expression, DefaultDevice>::PacketReturnType>::size;
+
+      // Give compiler a strong possibility to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
-      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      const StorageIndex UnrolledSize =
+          (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      const Index VectorizedSize = (size / PacketSize) * PacketSize;
-      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
-      for (Index i = VectorizedSize; i < size; ++i) {
+      for (StorageIndex i = VectorizedSize; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -80,55 +156,162 @@ class TensorExecutor<Expression, DefaultDevice, true>
   }
 };
 
+/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
+  typedef typename traits<Expression>::Index StorageIndex;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                         const DefaultDevice& device = DefaultDevice()) {
+    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
+        TensorBlockMapper;
+
+    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
+        TensorBlockDesc;
+    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
+        TensorBlockScratch;
+
+    Evaluator evaluator(expr, device);
+
+    // TODO(ezhulenev): Do not use tiling for small tensors?
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
 
+    if (needs_assign) {
+      // Query expression tree for desired block size/shape.
+      const TensorBlockResourceRequirements requirements =
+          evaluator.getResourceRequirements();
 
-// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+      const TensorBlockMapper block_mapper(
+          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
+          requirements);
+
+      // Share scratch memory allocator between all blocks.
+      TensorBlockScratch scratch(device);
+
+      const StorageIndex total_block_count = block_mapper.blockCount();
+      for (StorageIndex i = 0; i < total_block_count; ++i) {
+        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
+        evaluator.evalBlock(desc, scratch);
+        scratch.reset();
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ *
+ * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
+ *     pool, and will block the caller thread until all tasks are finished.
+ *
+ * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
+ *     the ThreadPoolDevice managed thread pool, and will return immediately.
+ *     It will call 'done' callback after all tasks are finished.
+ */
 #ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+
+template <typename TensorBlockMapper>
+struct TensorExecutorTilingContext {
+  TensorExecutorTilingContext() = default;
+  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
+                              const TensorOpCost& b_cost, size_t b_aligned_size)
+      : block_mapper(b_mapper),
+        cost(b_cost),
+        aligned_blocksize(b_aligned_size) {}
+
+  TensorBlockMapper block_mapper;  // navigate through blocks
+  TensorOpCost cost;               // cost of computing a single block
+  size_t aligned_blocksize;        // block size after memory alignment
+};
+
+// Computes a block evaluation parameters, and allocates temporary memory buffer
+// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
+template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
+TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
+    const Evaluator& evaluator) {
+  // Query expression tree for desired block size/shape.
+  TensorBlockResourceRequirements requirements =
+      evaluator.getResourceRequirements();
+
+  // Update target block size based on cost model.
+  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+      1, requirements.cost_per_coeff);
+  requirements.size = static_cast<size_t>(1.0 / taskSize);
+
+  TensorBlockMapper block_mapper(
+      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
+      requirements);
+
+  size_t block_size = block_mapper.blockTotalSize();
+  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+  const size_t aligned_blocksize =
+      align *
+      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
+
+  return {block_mapper, requirements.cost_per_coeff * block_size,
+          aligned_blocksize};
+}
+
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+                  const StorageIndex lastIdx) {
     Evaluator evaluator = *evaluator_in;
-    eigen_assert(last >= first);
-    for (Index i = first; i < last; ++i) {
+    eigen_assert(lastIdx >= firstIdx);
+    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
-    return size;
-  }
+  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
 };
 
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, true> {
-  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+  static const int PacketSize =
+      unpacket_traits<typename Evaluator::PacketReturnType>::size;
 
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+                  const StorageIndex lastIdx) {
     Evaluator evaluator = *evaluator_in;
-    eigen_assert(last >= first);
-    Index i = first;
-    if (last - first >= PacketSize) {
-      eigen_assert(first % PacketSize == 0);
-      Index last_chunk_offset = last - 4 * PacketSize;
-      // Give the compiler a strong hint to unroll the loop. But don't insist
-      // on unrolling, because if the function is expensive the compiler should not
+    eigen_assert(lastIdx >= firstIdx);
+    StorageIndex i = firstIdx;
+    if (lastIdx - firstIdx >= PacketSize) {
+      eigen_assert(firstIdx % PacketSize == 0);
+      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
+      // Give compiler a strong possibility to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      last_chunk_offset = last - PacketSize;
+      last_chunk_offset = lastIdx - PacketSize;
       for (; i <= last_chunk_offset; i += PacketSize) {
         evaluator.evalPacket(i);
       }
     }
-    for (; i < last; ++i) {
+    for (; i < lastIdx; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
+  static StorageIndex alignBlockSize(StorageIndex size) {
     // Align block size to packet size and account for unrolling in run above.
     if (size >= 16 * PacketSize) {
       return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@@ -138,144 +321,376 @@ struct EvalRange<Evaluator, Index, true> {
   }
 };
 
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
  public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
-  {
+  typedef typename Expression::Index StorageIndex;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                         const ThreadPoolDevice& device) {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+
     Evaluator evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
-                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
-                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+                         EvalRange::alignBlockSize,
+                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
+                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                          });
-#else
-      size_t num_threads = device.numThreads();
-      if (num_threads > 1) {
-        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
-            size, evaluator.costPerCoeff(Vectorizable), num_threads);
-      }
-      if (num_threads == 1) {
-        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
-      } else {
-        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
-        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-        const Index numblocks = size / blocksize;
-
-        Barrier barrier(numblocks);
-        for (int i = 0; i < numblocks; ++i) {
-          device.enqueue_with_barrier(
-              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
-              &evaluator, i * blocksize, (i + 1) * blocksize);
-        }
-        if (numblocks * blocksize < size) {
-          EvalRange<Evaluator, Index, Vectorizable>::run(
-              &evaluator, numblocks * blocksize, size);
+    }
+    evaluator.cleanup();
+  }
+};
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
+      TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+      TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const ThreadPoolDevice& device) {
+    Evaluator evaluator(expr, device);
+
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const TilingContext tiling =
+          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
+                                                   Vectorizable>(evaluator);
+
+      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
+                                                       IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+             ++block_idx) {
+          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
+          evaluator.evalBlock(desc, scratch);
+          scratch.reset();
         }
-        barrier.Wait();
+      };
+
+      // Evaluate small expressions directly as a single block.
+      if (tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(device);
+        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
+        evaluator.evalBlock(desc, scratch);
+      } else {
+        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
+                           eval_block);
       }
-#endif  // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
     }
     evaluator.cleanup();
   }
 };
-#endif  // EIGEN_USE_THREADS
 
+template <typename Expression, typename DoneCallback, bool Vectorizable,
+          TiledEvaluation Tiling>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index StorageIndex;
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+                                           const ThreadPoolDevice& device,
+                                           DoneCallback done) {
+    TensorAsyncExecutorContext* const ctx =
+        new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
+      device.parallelForAsync(
+          size, ctx->evaluator.costPerCoeff(Vectorizable),
+          EvalRange::alignBlockSize,
+          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
+            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
+          },
+          [ctx]() { delete ctx; });
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr,
+                               const ThreadPoolDevice& thread_pool,
+                               DoneCallback done)
+        : evaluator(expr, thread_pool), on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
+      evaluator.cleanup();
+      on_done();
+    }
+
+    Evaluator evaluator;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+      TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+                                           const ThreadPoolDevice& device,
+                                           DoneCallback done) {
+
+    TensorAsyncExecutorContext* const ctx =
+        new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      ctx->tiling = internal::GetTensorExecutorTilingContext<
+          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
+
+      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(ctx->device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+             ++block_idx) {
+          TensorBlockDesc desc =
+              ctx->tiling.block_mapper.blockDescriptor(block_idx);
+          ctx->evaluator.evalBlock(desc, scratch);
+          scratch.reset();
+        }
+      };
+
+      // Evaluate small expressions directly as a single block.
+      if (ctx->tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(ctx->device);
+        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
+        ctx->evaluator.evalBlock(desc, scratch);
+        delete ctx;
+      } else {
+        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
+                                     ctx->tiling.cost, eval_block,
+                                     [ctx]() { delete ctx; });
+      }
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr,
+                               const ThreadPoolDevice& thread_pool,
+                               DoneCallback done)
+        : device(thread_pool),
+          evaluator(expr, thread_pool),
+          on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
+      evaluator.cleanup();
+      on_done();
+    }
+
+    const ThreadPoolDevice& device;
+    Evaluator evaluator;
+    TilingContext tiling;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+#endif  // EIGEN_USE_THREADS
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU)
 
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
   static void run(const Expression& expr, const GpuDevice& device);
 };
 
-
-#if defined(__CUDACC__)
-template <typename Evaluator, typename Index, bool Vectorizable>
+#if defined(EIGEN_GPUCC)
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
-  static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    for (Index i = first; i < last; i += step_size) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
-  static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const Index vectorized_size = (last / PacketSize) * PacketSize;
-    const Index vectorized_step_size = step_size * PacketSize;
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
 
     // Use the vector path
-    for (Index i = first * PacketSize; i < vectorized_size;
+    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
          i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for (Index i = vectorized_size + first; i < last; i += step_size) {
+    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
 
-  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index step_size = blockDim.x * gridDim.x;
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
 
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
-  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
 }
 
 /*static*/
-template <typename Expression, bool Vectorizable>
-inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
     const Expression& expr, const GpuDevice& device) {
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
   if (needs_assign) {
-    const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const Index size = array_prod(evaluator.dimensions());
+
+    const int block_size = device.maxGpuThreadsPerBlock();
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
 
-    LAUNCH_CUDA_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+    LAUNCH_GPU_KERNEL(
+        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
         num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
 }
 
-#endif  // __CUDACC__
+#endif  // EIGEN_GPUCC
 #endif  // EIGEN_USE_GPU
 
 // SYCL Executor policy
 #ifdef EIGEN_USE_SYCL
 
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, SyclDevice, Vectorizable> {
-public:
-  static inline void run(const Expression &expr, const SyclDevice &device) {
-    // call TensorSYCL module
-    TensorSycl::run(expr, device);
+template <typename Evaluator>
+struct ExecExprFunctorKernel {
+  typedef typename Evaluator::Index Index;
+  Evaluator evaluator;
+  const Index range;
+  template <typename Scratch>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
+      const Scratch, Evaluator evaluator_, const Index range_)
+      : evaluator(evaluator_), range(range_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
+      cl::sycl::nd_item<1> itemID) {
+    compute(itemID);
+  }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
+  compute(const cl::sycl::nd_item<1>& itemID) {
+    Index gId = static_cast<Index>(itemID.get_global_linear_id());
+    Index total_threads = itemID.get_global_range(0);
+
+    for (Index i = gId; i < range; i += total_threads) {
+      evaluator.evalScalar(i);
+    }
+  }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
+  compute(const cl::sycl::nd_item<1>& itemID) {
+    const Index vectorizedRange =
+        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
+    Index gId = static_cast<Index>(itemID.get_global_linear_id());
+    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    const Index start = Evaluator::PacketSize * gId;
+    for (Index i = start; i < vectorizedRange; i += step) {
+      evaluator.evalPacket(i);
+    }
+    gId += vectorizedRange;
+    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
+      evaluator.evalScalar(i);
+    }
+  }
+};
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index Index;
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const Eigen::SyclDevice& dev) {
+    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
+    Evaluator evaluator(expr, dev);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign) {
+      Index range, GRange, tileSize;
+      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
+      total_size = (total_size == 0) ? 1 : total_size;
+      const int PacketSize =
+          Eigen::PacketType<typename Evaluator::CoeffReturnType,
+                            Eigen::SyclDevice>::size;
+      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
+      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
+      range = total_size;
+
+      dev.template nullary_kernel_launcher<
+          typename Evaluator::CoeffReturnType,
+          ExecExprFunctorKernel<Evaluator> >(
+          evaluator,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
+                                cl::sycl::range<1>(tileSize)),
+          Index(1), range);
+    }
+    evaluator.cleanup();
   }
 };
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 85dfc7a69f760ec7883f859728d4cc53eb1dd38b..c9bccfc66f1cca4c86755a5d6a371c46a0418276 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -38,7 +38,7 @@ struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename XprTraits::PointerType PointerType;
   enum {
     Flags = 0
   };
@@ -89,6 +89,10 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename TypeConversion<Scalar, 
+                                  typename XprTraits::PointerType
+                                  >::type 
+                                  PointerType;
 };
 
 template<typename UnaryOp, typename XprType>
@@ -161,7 +165,12 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename TypeConversion<Scalar,
+                                  typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                                                      typename traits<LhsXprType>::PointerType,
+                                                      typename traits<RhsXprType>::PointerType>::type
+                                  >::type 
+                                  PointerType;
   enum {
     Flags = 0
   };
@@ -238,7 +247,12 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
   typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename TypeConversion<Scalar,
+                                  typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
+                                                      typename traits<Arg2XprType>::PointerType,
+                                                      typename traits<Arg3XprType>::PointerType>::type
+                                  >::type 
+                                  PointerType;
   enum {
     Flags = 0
   };
@@ -314,6 +328,9 @@ struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
   typedef typename ElseXprType::Nested ElseNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
+                               typename traits<ThenXprType>::PointerType,
+                               typename traits<ElseXprType>::PointerType>::type PointerType;
 };
 
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 08eb5595a28e967db0e86d2f74264446ddbbb438..4a1a0687cabbb915d3987bedcfd94149e296005d 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -10,10 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 
-// This code requires the ability to initialize arrays of constant
-// values directly inside a class.
-#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900
-
 namespace Eigen {
 
 /** \class TensorFFT
@@ -71,6 +67,7 @@ struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename traits<XprType>::PointerType PointerType;
 };
 
 template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
@@ -130,17 +127,24 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
   typedef OutputScalar CoeffReturnType;
   typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = true,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
       eigen_assert(input_dims[i] > 0);
@@ -165,19 +169,19 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
     return m_dimensions;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalToBuf(data);
       return false;
     } else {
-      m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size);
+      m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
       evalToBuf(m_data);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     if (m_data) {
       m_device.deallocate(m_data);
       m_data = NULL;
@@ -200,11 +204,16 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
-
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
 
  private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
     const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
     ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
 
@@ -230,20 +239,32 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         //   t_n = exp(sqrt(-1) * pi * n^2 / line_len)
         // for n = 0, 1,..., line_len-1.
         // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
-        pos_j_base_powered[0] = ComplexScalar(1, 0);
-        if (line_len > 1) {
-          const RealScalar pi_over_len(EIGEN_PI / line_len);
-          const ComplexScalar pos_j_base = ComplexScalar(
-	       std::cos(pi_over_len), std::sin(pi_over_len));
-          pos_j_base_powered[1] = pos_j_base;
-          if (line_len > 2) {
-            const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
-            for (int j = 2; j < line_len + 1; ++j) {
-              pos_j_base_powered[j] = pos_j_base_powered[j - 1] *
-                                      pos_j_base_powered[j - 1] /
-                                      pos_j_base_powered[j - 2] * pos_j_base_sq;
-            }
-          }
+
+        // The recurrence is correct in exact arithmetic, but causes
+        // numerical issues for large transforms, especially in
+        // single-precision floating point.
+        //
+        // pos_j_base_powered[0] = ComplexScalar(1, 0);
+        // if (line_len > 1) {
+        //   const ComplexScalar pos_j_base = ComplexScalar(
+        //       numext::cos(M_PI / line_len), numext::sin(M_PI / line_len));
+        //   pos_j_base_powered[1] = pos_j_base;
+        //   if (line_len > 2) {
+        //     const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
+        //     for (int i = 2; i < line_len + 1; ++i) {
+        //       pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
+        //           pos_j_base_powered[i - 1] /
+        //           pos_j_base_powered[i - 2] *
+        //           pos_j_base_sq;
+        //     }
+        //   }
+        // }
+        // TODO(rmlarsen): Find a way to use Eigen's vectorized sin
+        // and cosine functions here.
+        for (int j = 0; j < line_len + 1; ++j) {
+          double arg = ((EIGEN_PI * j) * j) / line_len;
+          std::complex<double> tmp(numext::cos(arg), numext::sin(arg));
+          pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp);
         }
       }
 
@@ -253,7 +274,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         // get data into line_buf
         const Index stride = m_strides[dim];
         if (stride == 1) {
-          memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+          m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -261,7 +282,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
           }
         }
 
-        // processs the line
+        // process the line
         if (is_power_of_two) {
           processDataLineCooleyTukey(line_buf, line_len, log_len);
         }
@@ -271,7 +292,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
         // write back
         if (FFTDir == FFT_FORWARD && stride == 1) {
-          memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+          m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           const ComplexScalar div_factor =  ComplexScalar(1.0 / line_len, 0);
@@ -562,12 +583,12 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
  protected:
   Index m_size;
-  const FFT& m_fft;
+  const FFT EIGEN_DEVICE_REF m_fft;
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
   TensorEvaluator<ArgType, Device> m_impl;
-  CoeffReturnType* m_data;
-  const Device& m_device;
+  EvaluatorPointerType m_data;
+  const Device EIGEN_DEVICE_REF m_device;
 
   // This will support a maximum FFT size of 2^32 for each dimension
   // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
@@ -645,7 +666,4 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
 }  // end namespace Eigen
 
-#endif  // EIGEN_HAS_CONSTEXPR
-
-
 #endif  // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index fcee5f60d03688ec42dc197dbe898186a5880ca4..ca39bb855217e287abeacbf0333c671441bb8046 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -20,7 +20,7 @@ namespace Eigen {
   * The fixed sized equivalent of
   * Eigen::Tensor<float, 3> t(3, 5, 7);
   * is
-  * Eigen::TensorFixedSize<float, Size<3,5,7>> t;
+  * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
   */
 
 template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
@@ -40,11 +40,18 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
 
     enum {
       IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
+      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+      BlockAccess = false,
+      PreferBlockAccess = false,
       Layout = Options_ & RowMajor ? RowMajor : ColMajor,
       CoordAccess = true,
       RawAccess = true
     };
 
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
   typedef Dimensions_ Dimensions;
   static const std::size_t NumIndices = Dimensions::count;
 
@@ -333,27 +340,10 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other)
-    {
-      // FIXME: check that the dimensions of other match the dimensions of *this.
-      // Unfortunately this isn't possible yet when the rhs is an expression.
-      typedef TensorAssignOp<Self, const TensorFixedSize> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other)
-    {
-      // FIXME: check that the dimensions of other match the dimensions of *this.
-      // Unfortunately this isn't possible yet when the rhs is an expression.
-      typedef TensorAssignOp<Self, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    // FIXME: check that the dimensions of other match the dimensions of *this.
+    // Unfortunately this isn't possible yet when the rhs is an expression.
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize)
+
 
   protected:
     EIGEN_DEVICE_FUNC
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 8bece4e65152d8b0dfbf7307b921459491ca878e..e800dedc6cf062b97224bde02475f9ab49dd5021 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -12,9 +12,16 @@
 
 namespace Eigen {
 
+/** \class TensorForcedEval
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor reshaping class.
+  *
+  *
+  */
 namespace internal {
-template<typename XprType, template <class> class MakePointer_>
-struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
+template<typename XprType>
+struct traits<TensorForcedEvalOp<XprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename XprType::Scalar Scalar;
@@ -25,50 +32,31 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 
   enum {
     Flags = 0
   };
-  template <class T> struct MakePointer {
-    // Intermediate typedef to workaround MSVC issue.
-    typedef MakePointer_<T> MakePointerT;
-    typedef typename MakePointerT::Type Type;
-  };
 };
 
-template<typename XprType, template <class> class MakePointer_>
-struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
+template<typename XprType>
+struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
 {
-  typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
+  typedef const TensorForcedEvalOp<XprType>& type;
 };
 
-template<typename XprType, template <class> class MakePointer_>
-struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
+template<typename XprType>
+struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
 {
-  typedef TensorForcedEvalOp<XprType, MakePointer_> type;
+  typedef TensorForcedEvalOp<XprType> type;
 };
 
 }  // end namespace internal
 
 
 
-// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
-
-/** \class TensorForcedEvalOp
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor reshaping class.
-  *
-  *
-  */
-/// `template <class> class MakePointer_` is added to convert the host pointer to the device pointer.
-/// It is added due to the fact that for our device compiler `T*` is not allowed.
-/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`.
-/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_<T>` is `T*` .
-/// Therefore, by adding the default value, we managed to convert the type and it does not break any
-/// existing code as its default value is `T*`.
-template<typename XprType, template <class> class MakePointer_>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
+template<typename XprType>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -89,49 +77,113 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePoi
     typename XprType::Nested m_xpr;
 };
 
+namespace internal {
+template <typename Device, typename CoeffReturnType>
+struct non_integral_type_placement_new{
+  template <typename StorageType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) {
+   // Initialize non-trivially constructible types.
+    if (!internal::is_arithmetic<CoeffReturnType>::value) {
+      for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType();
+    }
+}
+};
 
-template<typename ArgType, typename Device, template <class> class MakePointer_>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
+// SYCL does not support non-integral types 
+// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices 
+// no matching function for call to 'operator new'
+template <typename CoeffReturnType>
+struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> {
+  template <typename StorageType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {
+}
+};
+} // end namespace internal
+
+template<typename ArgType_, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
 {
-  typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
+  typedef const typename internal::remove_all<ArgType_>::type ArgType;
+  typedef TensorForcedEvalOp<ArgType> XprType;
   typedef typename ArgType::Scalar Scalar;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = (PacketSize > 1),
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = true
+    IsAligned         = true,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = internal::is_arithmetic<CoeffReturnType>::value,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = true
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-	/// op_ is used for sycl
-      : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
+  static const int NumDims = internal::traits<ArgType>::NumDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_op(op.expression()),
+      m_device(device), m_buffer(NULL)
   { }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     const Index numValues =  internal::array_prod(m_impl.dimensions());
-    m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
-    // Should initialize the memory in case we're dealing with non POD types.
-    if (NumTraits<CoeffReturnType>::RequireInitialization) {
-      for (Index i = 0; i < numValues; ++i) {
-        new(m_buffer+i) CoeffReturnType();
-      }
-    }
+    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+
+   internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
+
     typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
-    EvalTo evalToTmp(m_buffer, m_op);
-    const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
-    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, PacketAccess>::run(evalToTmp, m_device);
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    internal::TensorExecutor<
+        const EvalTo, typename internal::remove_const<Device>::type,
+        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+        run(evalToTmp, m_device);
+
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_device.deallocate(m_buffer);
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    const Index numValues = internal::array_prod(m_impl.dimensions());
+    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(
+        numValues * sizeof(CoeffReturnType)));
+    typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type>
+        EvalTo;
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); },
+                             std::move(done));
+    internal::TensorAsyncExecutor<
+        const EvalTo, typename internal::remove_const<Device>::type,
+        decltype(on_done),
+        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+        runAsync(evalToTmp, m_device, std::move(on_done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_device.deallocate_temp(m_buffer);
     m_buffer = NULL;
   }
 
@@ -146,21 +198,37 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    assert(m_buffer != NULL);
+    return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EvaluatorPointerType data() const { return m_buffer; }
 
-  /// required by sycl in order to extract the sycl accessor
-  const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
-  /// used by sycl in order to build the sycl buffer
-  const Device& device() const{return m_device;}
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_buffer.bind(cgh);
+    m_impl.bind(cgh);
+  }
+#endif
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const ArgType m_op;
-  const Device& m_device;
-  typename MakePointer<CoeffReturnType>::Type m_buffer;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_buffer;
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 52b803d7f269b32f35b2c62f63b830822531e851..246ebe44eeefcd6093006c5fb1b7f375d45679b1 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -12,7 +12,7 @@
 
 namespace Eigen {
 
-// MakePointer class is used as a container of the adress space of the pointer
+// MakePointer class is used as a container of the address space of the pointer
 // on the host and on the device. From the host side it generates the T* pointer
 // and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to
 // T* m_data on the host. It is always called on the device.
@@ -20,8 +20,35 @@ namespace Eigen {
 // map_allocator.
 template<typename T> struct MakePointer {
   typedef T* Type;
+  typedef const T* ConstType;
 };
 
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) {
+  return const_cast<T*>(data);
+}
+
+// The StorageMemory class is a container of the device specific pointer
+// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression
+// is a device-agnostic type and need MakePointer class for type conversion,
+// the TensorEvaluator class can be specialized for a device, hence it is possible
+// to construct different types of temproray storage memory in TensorEvaluator
+// for different devices by specializing the following StorageMemory class.
+template<typename T, typename device> struct StorageMemory: MakePointer <T> {};
+
+namespace internal{
+template<typename A, typename B> struct Pointer_type_promotion {
+  static const bool val=false;
+};
+template<typename A> struct Pointer_type_promotion<A, A> {
+  static const bool val = true;
+};
+template<typename A, typename B> struct TypeConversion {
+  typedef A* type;
+};
+}
+
+
 template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
 template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
 template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
@@ -37,7 +64,7 @@ template<typename Op, typename Dims, typename XprType, template <class> class Ma
 template<typename XprType> class TensorIndexTupleOp;
 template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
 template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
-template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
+template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp;
 template<typename TargetType, typename XprType> class TensorConversionOp;
 template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
 template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
@@ -58,21 +85,50 @@ template<typename Strides, typename XprType> class TensorInflationOp;
 template<typename Generator, typename XprType> class TensorGeneratorOp;
 template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
 template<typename Op, typename XprType> class TensorScanOp;
+template<typename Dims, typename XprType> class TensorTraceOp;
 
 template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
 
 template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
+template<typename XprType> class TensorForcedEvalOp;
 
 template<typename ExpressionType, typename DeviceType> class TensorDevice;
+template<typename ExpressionType, typename DeviceType, typename DoneCallback> class TensorAsyncDevice;
 template<typename Derived, typename Device> struct TensorEvaluator;
 
+struct NoOpOutputKernel;
+
 struct DefaultDevice;
 struct ThreadPoolDevice;
 struct GpuDevice;
 struct SyclDevice;
 
+#ifdef EIGEN_USE_SYCL
+
+template <typename T> struct MakeSYCLPointer {
+  typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
+};
+
+template <typename T>
+EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
+constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
+  return data;
+}
+
+template <typename T>
+struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
+template <typename T>
+struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
+
+namespace TensorSycl {
+namespace internal{
+template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
+}
+}
+#endif
+
+
 enum FFTResultType {
   RealPart = 0,
   ImagPart = 1,
@@ -98,10 +154,36 @@ struct IsVectorizable<GpuDevice, Expression> {
                             TensorEvaluator<Expression, GpuDevice>::IsAligned;
 };
 
+// Tiled evaluation strategy.
+enum TiledEvaluation {
+  Off = 0,    // tiled evaluation is not supported
+  On = 1,     // still work in progress (see TensorBlock.h)
+};
+
+template <typename Device, typename Expression>
+struct IsTileable {
+  // Check that block evaluation is supported and it's a preferred option (at
+  // least one sub-expression has much faster block evaluation, e.g.
+  // broadcasting).
+  static const bool BlockAccess =
+      TensorEvaluator<Expression, Device>::BlockAccess &&
+      TensorEvaluator<Expression, Device>::PreferBlockAccess;
+
+  static const TiledEvaluation value =
+      BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off;
+};
+
 template <typename Expression, typename Device,
-          bool Vectorizable = IsVectorizable<Device, Expression>::value>
+          bool Vectorizable      = IsVectorizable<Device, Expression>::value,
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
 class TensorExecutor;
 
+template <typename Expression, typename Device, typename DoneCallback,
+          bool Vectorizable = IsVectorizable<Device, Expression>::value,
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+class TensorAsyncExecutor;
+
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index d73f6dc68338facef151d8126f83878cb511551f..d9630322435fdd2cd75cc4e67ccf97a44088404d 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -20,7 +20,7 @@ namespace internal {
 template <typename Scalar>
 struct scalar_mod_op {
   EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; }
   const Scalar m_divisor;
 };
 template <typename Scalar>
@@ -33,8 +33,8 @@ struct functor_traits<scalar_mod_op<Scalar> >
  */
 template <typename Scalar>
 struct scalar_mod2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
@@ -42,7 +42,7 @@ struct functor_traits<scalar_mod2_op<Scalar> >
 
 template <typename Scalar>
 struct scalar_fmod_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
   operator()(const Scalar& a, const Scalar& b) const {
     return numext::fmod(a, b);
@@ -54,50 +54,19 @@ struct functor_traits<scalar_fmod_op<Scalar> > {
          PacketAccess = false };
 };
 
-
-/** \internal
-  * \brief Template functor to compute the sigmoid of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
-  */
-template <typename T>
-struct scalar_sigmoid_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
-    const T one = T(1);
-    return one / (one + numext::exp(-x));
-  }
-
-  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Packet packetOp(const Packet& x) const {
-    const Packet one = pset1<Packet>(T(1));
-    return pdiv(one, padd(one, pexp(pnegate(x))));
-  }
-};
-
-template <typename T>
-struct functor_traits<scalar_sigmoid_op<T> > {
-  enum {
-    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
-    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
-                   packet_traits<T>::HasNegate && packet_traits<T>::HasExp
-  };
-};
-
-
 template<typename Reducer, typename Device>
 struct reducer_traits {
   enum {
     Cost = 1,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasAdd;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     internal::scalar_sum_op<T> sum_op;
     *accum = sum_op(*accum, t);
@@ -133,16 +102,14 @@ template <typename T, typename Device>
 struct reducer_traits<SumReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasAdd
+    PacketAccess = PacketType<T, Device>::HasAdd,
+    IsStateful = false,
+    IsExactlyAssociative = NumTraits<T>::IsInteger
   };
 };
 
-
 template <typename T> struct MeanReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
-  static const bool IsStateful = true;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   MeanReducer() : scalarCount_(0), packetCount_(0) { }
 
@@ -166,16 +133,20 @@ template <typename T> struct MeanReducer
     return pset1<Packet>(initialize());
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
-    return accum / scalarCount_;
+    internal::scalar_quotient_op<T> quotient_op;
+    return quotient_op(accum, T(scalarCount_));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
-    return pdiv(vaccum, pset1<Packet>(packetCount_));
+    return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
     internal::scalar_sum_op<T> sum_op;
-    return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
+    internal::scalar_quotient_op<T> quotient_op;
+    return quotient_op(
+        sum_op(saccum, predux(vaccum)),
+        T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
   }
 
   protected:
@@ -187,7 +158,10 @@ template <typename T, typename Device>
 struct reducer_traits<MeanReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasAdd
+    PacketAccess = PacketType<T, Device>::HasAdd &&
+                   PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger,
+    IsStateful = true,
+    IsExactlyAssociative = NumTraits<T>::IsInteger
   };
 };
 
@@ -218,20 +192,19 @@ struct MinMaxBottomValue<T, false, false> {
 };
 
 
-template <typename T> struct MaxReducer
+template <typename T, int NaNPropagation=PropagateFast> struct MaxReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasMax;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t > *accum) { *accum = t; }
+    scalar_max_op<T, T, NaNPropagation> op;
+    *accum = op(t, *accum);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
-    (*accum) = pmax<Packet>(*accum, p);
+    scalar_max_op<T, T, NaNPropagation> op;
+    (*accum) = op.packetOp(*accum, p);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+    return MinMaxBottomValue<T, /*IsMax=*/true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -246,33 +219,34 @@ template <typename T> struct MaxReducer
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return numext::maxi(saccum, predux_max(vaccum));
+    scalar_max_op<T, T, NaNPropagation> op;
+    return op(saccum, op.predux(vaccum));
   }
 };
 
-template <typename T, typename Device>
-struct reducer_traits<MaxReducer<T>, Device> {
+template <typename T, typename Device, int NaNPropagation>
+    struct reducer_traits<MaxReducer<T, NaNPropagation>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasMax
+    PacketAccess = PacketType<T, Device>::HasMax,
+    IsStateful = false,
+    IsExactlyAssociative = (NaNPropagation!=PropagateFast)
   };
 };
 
-
-template <typename T> struct MinReducer
+template <typename T, int NaNPropagation=PropagateFast> struct MinReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasMin;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t < *accum) { *accum = t; }
+    scalar_min_op<T, T, NaNPropagation> op;
+    *accum = op(t, *accum);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
-    (*accum) = pmin<Packet>(*accum, p);
+    scalar_min_op<T, T, NaNPropagation> op;
+    (*accum) = op.packetOp(*accum, p);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+    return MinMaxBottomValue<T, /*IsMax=*/false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -287,24 +261,23 @@ template <typename T> struct MinReducer
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return numext::mini(saccum, predux_min(vaccum));
+    scalar_min_op<T, T, NaNPropagation> op;
+    return op(saccum, op.predux(vaccum));
   }
 };
 
-template <typename T, typename Device>
-struct reducer_traits<MinReducer<T>, Device> {
+template <typename T, typename Device, int NaNPropagation>
+    struct reducer_traits<MinReducer<T, NaNPropagation>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasMin
+    PacketAccess = PacketType<T, Device>::HasMin,
+    IsStateful = false,
+    IsExactlyAssociative = (NaNPropagation!=PropagateFast)
   };
 };
 
-
 template <typename T> struct ProdReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasMul;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     internal::scalar_product_op<T> prod_op;
     (*accum) = prod_op(*accum, t);
@@ -313,7 +286,6 @@ template <typename T> struct ProdReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmul<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
     internal::scalar_cast_op<int, T> conv;
     return conv(1);
@@ -340,16 +312,15 @@ template <typename T, typename Device>
 struct reducer_traits<ProdReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::MulCost,
-    PacketAccess = PacketType<T, Device>::HasMul
+    PacketAccess = PacketType<T, Device>::HasMul,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 
 struct AndReducer
 {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
     *accum = *accum && t;
   }
@@ -365,15 +336,14 @@ template <typename Device>
 struct reducer_traits<AndReducer, Device> {
   enum {
     Cost = 1,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 
 struct OrReducer {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
     *accum = *accum || t;
   }
@@ -389,19 +359,22 @@ template <typename Device>
 struct reducer_traits<OrReducer, Device> {
   enum {
     Cost = 1,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
-
-// Argmin/Argmax reducers
+// Argmin/Argmax reducers.  Returns the first occurrence if multiple locations
+// contain the same min/max value.
 template <typename T> struct ArgMaxTupleReducer
 {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t.second > accum->second) { *accum = t; }
+    if (t.second < accum->second) {
+      return;
+    } else if (t.second > accum->second || accum->first > t.first ) {
+      *accum = t;
+    }
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
     return T(0, NumTraits<typename T::second_type>::lowest());
@@ -415,18 +388,21 @@ template <typename T, typename Device>
 struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 
 template <typename T> struct ArgMinTupleReducer
 {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
-    if (t.second < accum->second) { *accum = t; }
+    if (t.second > accum->second) {
+      return;
+    } else if (t.second < accum->second || accum->first > t.first) {
+      *accum = t;
+    }
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
     return T(0, NumTraits<typename T::second_type>::highest());
@@ -440,7 +416,9 @@ template <typename T, typename Device>
 struct reducer_traits<ArgMinTupleReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
@@ -454,6 +432,7 @@ class GaussianGenerator {
                                       const array<T, NumDims>& std_devs)
       : m_means(means)
   {
+    EIGEN_UNROLL_LOOP
     for (size_t i = 0; i < NumDims; ++i) {
       m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
     }
@@ -461,6 +440,7 @@ class GaussianGenerator {
 
   EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
     T tmp = T(0);
+    EIGEN_UNROLL_LOOP
     for (size_t i = 0; i < NumDims; ++i) {
       T offset = coordinates[i] - m_means[i];
       tmp += offset * offset / m_two_sigmas[i];
@@ -483,6 +463,25 @@ struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
   };
 };
 
+template <typename Scalar>
+struct scalar_clamp_op {
+  EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& x) const {
+    return numext::mini(numext::maxi(x, m_min), m_max);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x) const {
+    return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
+  }
+  const Scalar m_min;
+  const Scalar m_max;
+};
+template<typename Scalar>
+struct functor_traits<scalar_clamp_op<Scalar> >
+{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; };
+
 } // end namespace internal
 } // end namespace Eigen
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index e27753b1946dc9ae42cb4bc0e6d213fe41382c94..174bf06838c008215a312537a03f3c94870db836 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -31,6 +31,7 @@ struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Generator, typename XprType>
@@ -87,40 +88,58 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = true,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_generator(op.generator())
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      :  m_device(device), m_generator(op.generator())
   {
-    TensorEvaluator<ArgType, Device> impl(op.expression(), device);
-    m_dimensions = impl.dimensions();
+    TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
+    m_dimensions = argImpl.dimensions();
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       m_strides[0] = 1;
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < NumDims; ++i) {
         m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
       }
     } else {
       m_strides[NumDims - 1] = 1;
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i >= 0; --i) {
         m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
       }
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
@@ -133,7 +152,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -145,6 +164,97 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
     return rslt;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.firstLevelCacheSize();
+    // TODO(ezhulenev): Generator should have a cost.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+        target_size);
+  }
+
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    static const bool is_col_major =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    // Compute spatial coordinates for the first block element.
+    array<Index, NumDims> coords;
+    extract_coordinates(desc.offset(), coords);
+    array<Index, NumDims> initial_coords = coords;
+
+    // Offset in the output block buffer.
+    Index offset = 0;
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = is_col_major ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
+      it[i].span = it[i].stride * (it[i].size - 1);
+      it[i].count = 0;
+    }
+    eigen_assert(it[0].stride == 1);
+
+    // Prepare storage for the materialized generator result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    static const int packet_size = PacketType<CoeffReturnType, Device>::size;
+
+    static const int inner_dim = is_col_major ? 0 : NumDims - 1;
+    const Index inner_dim_size = it[0].size;
+    const Index inner_dim_vectorized = inner_dim_size - packet_size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      Index i = 0;
+      // Generate data for the vectorized part of the inner-most dimension.
+      for (; i <= inner_dim_vectorized; i += packet_size) {
+        for (Index j = 0; j < packet_size; ++j) {
+          array<Index, NumDims> j_coords = coords;  // Break loop dependence.
+          j_coords[inner_dim] += j;
+          *(block_buffer + offset + i + j) = m_generator(j_coords);
+        }
+        coords[inner_dim] += packet_size;
+      }
+      // Finalize non-vectorized part of the inner-most dimension.
+      for (; i < inner_dim_size; ++i) {
+        *(block_buffer + offset + i) = m_generator(coords);
+        coords[inner_dim]++;
+      }
+      coords[inner_dim] = initial_coords[inner_dim];
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if (NumDims == 1) break;
+
+      // Update offset.
+      for (i = 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          offset += it[i].stride;
+          coords[is_col_major ? i : NumDims - 1 - i]++;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        coords[is_col_major ? i : NumDims - 1 - i] =
+            initial_coords[is_col_major ? i : NumDims - 1 - i];
+        offset -= it[i].span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool) const {
     // TODO(rmlarsen): This is just a placeholder. Define interface to make
@@ -153,21 +263,26 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
                                   TensorOpCost::MulCost<Scalar>());
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType  data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_strides[i];
+        const Index idx = index / m_fast_strides[i];
         index -= idx * m_strides[i];
         coords[i] = idx;
       }
       coords[0] = index;
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_strides[i];
+        const Index idx = index / m_fast_strides[i];
         index -= idx * m_strides[i];
         coords[i] = idx;
       }
@@ -175,8 +290,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
     }
   }
 
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fast_strides;
   Generator m_generator;
 };
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb53ce29891adcf7bbf4287e4db34923bb32915d
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
+// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU,  but
+// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
+// When compiling such files, gcc will end up trying to pick up the CUDA headers by 
+// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
+// To work around this issue for HIP systems (and leave the default behaviour intact), the
+// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and 
+// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
+// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
+
+#if defined(EIGEN_USE_HIP)
+
+#define gpuStream_t hipStream_t
+#define gpuDeviceProp_t hipDeviceProp_t
+#define gpuError_t hipError_t
+#define gpuSuccess hipSuccess
+#define gpuErrorNotReady hipErrorNotReady
+#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
+#define gpuGetErrorString hipGetErrorString
+#define gpuGetDeviceProperties hipGetDeviceProperties
+#define gpuStreamDefault hipStreamDefault
+#define gpuGetDevice hipGetDevice
+#define gpuSetDevice hipSetDevice
+#define gpuMalloc hipMalloc
+#define gpuFree hipFree
+#define gpuMemsetAsync hipMemsetAsync
+#define gpuMemcpyAsync hipMemcpyAsync
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuStreamQuery hipStreamQuery
+#define gpuSharedMemConfig hipSharedMemConfig
+#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
+#define gpuStreamSynchronize hipStreamSynchronize
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+
+#else
+
+#define gpuStream_t cudaStream_t
+#define gpuDeviceProp_t cudaDeviceProp
+#define gpuError_t cudaError_t
+#define gpuSuccess cudaSuccess
+#define gpuErrorNotReady cudaErrorNotReady
+#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
+#define gpuGetErrorString cudaGetErrorString
+#define gpuGetDeviceProperties cudaGetDeviceProperties
+#define gpuStreamDefault cudaStreamDefault
+#define gpuGetDevice cudaGetDevice
+#define gpuSetDevice cudaSetDevice
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
+#define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemcpyAsync cudaMemcpyAsync
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuStreamQuery cudaStreamQuery
+#define gpuSharedMemConfig cudaSharedMemConfig
+#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
+#define gpuStreamSynchronize cudaStreamSynchronize
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+
+#endif
+
+// gpu_assert can be overridden
+#ifndef gpu_assert
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIPCC do not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) assert(COND)
+#endif
+
+#endif // gpu_assert
+
+#endif  // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d142f2ee8790c87590ca235c7fecb5592366da1
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+
+#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef gpuStream_t
+#undef gpuDeviceProp_t 
+#undef gpuError_t
+#undef gpuSuccess
+#undef gpuErrorNotReady
+#undef gpuGetDeviceCount
+#undef gpuGetErrorString
+#undef gpuGetDeviceProperties
+#undef gpuStreamDefault
+#undef gpuGetDevice
+#undef gpuSetDevice
+#undef gpuMalloc
+#undef gpuFree
+#undef gpuMemsetAsync
+#undef gpuMemcpyAsync
+#undef gpuMemcpyDeviceToDevice
+#undef gpuMemcpyDeviceToHost
+#undef gpuMemcpyHostToDevice
+#undef gpuStreamQuery
+#undef gpuSharedMemConfig
+#undef gpuDeviceSetSharedMemConfig
+#undef gpuStreamSynchronize
+#undef gpuDeviceSynchronize
+#undef gpuMemcpy
+
+#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 566856ed20028d275847f538cc23b27d0c2bc53e..dd51850b7760f42713ad5a7ac3b856362d5ac5e7 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -27,6 +27,7 @@ namespace Eigen {
   * patch_cols, and 1 for all the additional dimensions.
   */
 namespace internal {
+
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
 struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
 {
@@ -38,6 +39,7 @@ struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -52,6 +54,66 @@ struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorIm
   typedef TensorImagePatchOp<Rows, Cols, XprType> type;
 };
 
+template <typename Self, bool Vectorizable>
+struct ImagePatchCopyOp {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::Impl Impl;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+      Scalar* dst_data, const Index src_index) {
+    const Impl& impl = self.impl();
+    for (Index i = 0; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i] = impl.coeff(src_index + i);
+    }
+  }
+};
+
+template <typename Self>
+struct ImagePatchCopyOp<Self, true> {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::Impl Impl;
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+      Scalar* dst_data, const Index src_index) {
+    const Impl& impl = self.impl();
+    const Index packet_size = internal::unpacket_traits<Packet>::size;
+    const Index vectorized_size =
+        (num_coeff_to_copy / packet_size) * packet_size;
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      Packet p = impl.template packet<Unaligned>(src_index + i);
+      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
+    }
+    for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i] = impl.coeff(src_index + i);
+    }
+  }
+};
+
+template <typename Self>
+struct ImagePatchPaddingOp {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Index num_coeff_to_pad, const Scalar padding_value,
+      const Index dst_index, Scalar* dst_data) {
+    const Index packet_size = internal::unpacket_traits<Packet>::size;
+    const Packet padded_packet = internal::pset1<Packet>(padding_value);
+    const Index vectorized_size =
+        (num_coeff_to_pad / packet_size) * packet_size;
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
+                                                   padded_packet);
+    }
+    for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
+      dst_data[dst_index + i] = padding_value;
+    }
+  }
+};
+
 }  // end namespace internal
 
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -70,12 +132,12 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
                                                            DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                            DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                            PaddingType padding_type, Scalar padding_value)
-      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
-        m_padding_type(padding_type), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+                                                           m_padding_type(padding_type), m_padding_value(padding_value) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
                                                            DenseIndex row_strides, DenseIndex col_strides,
@@ -84,13 +146,14 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
                                                            DenseIndex padding_top, DenseIndex padding_bottom,
                                                            DenseIndex padding_left, DenseIndex padding_right,
                                                            Scalar padding_value)
-      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
-        m_padding_left(padding_left), m_padding_right(padding_right),
-        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+                                                           m_padding_left(padding_left), m_padding_right(padding_right),
+                                                           m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
 
     EIGEN_DEVICE_FUNC
     DenseIndex patch_rows() const { return m_patch_rows; }
@@ -161,18 +224,26 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   typedef TensorEvaluator<ArgType, Device> Impl;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
+      : m_device(device), m_impl(op.expression(), device)
   {
     EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -238,9 +309,15 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
           // Calculate the padding
           m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
           m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
+          // The padding size calculation for PADDING_SAME has been updated to
+          // be consistent with how TensorFlow extracts its paddings.
+          m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop);
+          m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft);
           break;
         default:
           eigen_assert(false && "unexpected padding");
+          m_outputCols=0; // silence the uninitialised warning;
+          m_outputRows=0; //// silence the uninitialised warning;
       }
     }
     eigen_assert(m_outputRows > 0);
@@ -312,12 +389,20 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -418,20 +503,27 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
     return packetWithPossibleZero(index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-  Index rowPaddingTop() const { return m_rowPaddingTop; }
-  Index colPaddingLeft() const { return m_colPaddingLeft; }
-  Index outputRows() const { return m_outputRows; }
-  Index outputCols() const { return m_outputCols; }
-  Index userRowStride() const { return m_row_strides; }
-  Index userColStride() const { return m_col_strides; }
-  Index userInRowStride() const { return m_in_row_strides; }
-  Index userInColStride() const { return m_in_col_strides; }
-  Index rowInflateStride() const { return m_row_inflate_strides; }
-  Index colInflateStride() const { return m_col_inflate_strides; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
@@ -449,6 +541,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -500,6 +593,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
 
   Scalar m_paddingValue;
 
+  const Device EIGEN_DEVICE_REF m_device;
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 3209fecd34d770db04503e93d3e7de80d7a36e72..2d8c7b9033f22a48b3e7192c31216396fa270dd0 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -37,36 +37,36 @@ namespace Eigen {
   * \sa Tensor
   */
 
-template <DenseIndex n>
+template <Index n>
 struct type2index {
-  static const DenseIndex value = n;
-  EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; }
-  EIGEN_DEVICE_FUNC void set(DenseIndex val) {
+  static const Index value = n;
+  EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; }
+  EIGEN_DEVICE_FUNC void set(Index val) {
     eigen_assert(val == n);
   }
 };
 
 // This can be used with IndexPairList to get compile-time constant pairs,
 // such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct type2indexpair {
-  static const DenseIndex first = f;
-  static const DenseIndex second = s;
+  static const Index first = f;
+  static const Index second = s;
 
-  constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {
-    return IndexPair<DenseIndex>(f, s);
+  constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const {
+    return IndexPair<Index>(f, s);
   }
 
-  EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
+  EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) {
     eigen_assert(val.first == f);
     eigen_assert(val.second == s);
   }
 };
 
 
-template<DenseIndex n> struct NumTraits<type2index<n> >
+template<Index n> struct NumTraits<type2index<n> >
 {
-  typedef DenseIndex Real;
+  typedef Index Real;
   enum {
     IsComplex = 0,
     RequireInitialization = false,
@@ -75,28 +75,28 @@ template<DenseIndex n> struct NumTraits<type2index<n> >
     MulCost = 1
   };
 
-  EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; }
-  EIGEN_DEVICE_FUNC static inline Real highest() { return n; }
-  EIGEN_DEVICE_FUNC static inline Real lowest() { return n; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; }
 };
 
 namespace internal {
 template <typename T>
-EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) {
-  val = new_val;
+EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) {
+  val = internal::convert_index<T>(new_val);
 }
-template <DenseIndex n>
-EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
+template <Index n>
+EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) {
   val.set(new_val);
 }
 
 template <typename T>
-EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) {
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) {
   val = new_val;
 }
-template <DenseIndex f, DenseIndex s>
-EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {
+template <Index f, Index s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) {
   val.set(new_val);
 }
 
@@ -106,36 +106,36 @@ struct is_compile_time_constant {
   static constexpr bool value = false;
 };
 
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<type2index<idx> > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<const type2index<idx> > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<type2index<idx>& > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<const type2index<idx>& > {
   static constexpr bool value = true;
 };
 
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<type2indexpair<f, s> > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<const type2indexpair<f, s> > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<type2indexpair<f, s>& > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<const type2indexpair<f, s>& > {
   static constexpr bool value = true;
 };
@@ -228,15 +228,15 @@ template <typename T, typename... O>
 
 
 
-template <DenseIndex Idx, typename ValueT>
+template <Index Idx, typename ValueT>
 struct tuple_coeff {
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) {
     //    return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
     return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) {
+  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) {
     if (i == Idx) {
       update_value(array_get<Idx>(t), value);
     } else {
@@ -245,7 +245,7 @@ struct tuple_coeff {
   }
 
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) {
     return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
         tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
   }
@@ -268,18 +268,18 @@ struct tuple_coeff {
 template <typename ValueT>
 struct tuple_coeff<0, ValueT> {
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) {
     //  eigen_assert (i == 0);  // gcc fails to compile assertions in constexpr
     return array_get<0>(t)/* * (i == 0)*/;
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
+  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) {
     eigen_assert (i == 0);
     update_value(array_get<0>(t), value);
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>&) {
-    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value & (i == 0);
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0);
   }
 
   template <typename... T>
@@ -298,32 +298,43 @@ struct tuple_coeff<0, ValueT> {
 
 template<typename FirstType, typename... OtherTypes>
 struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::set(i, *this, value);
   }
 
   EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
   EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
   EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
-  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
   }
   EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_known_statically(*this);
   }
 
   EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_statically_known_to_increase(*this);
   }
 };
 
+template <typename FirstType, typename... OtherTypes>
+std::ostream& operator<<(std::ostream& os,
+                         const IndexList<FirstType, OtherTypes...>& dims) {
+  os << "[";
+  for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
+}
 
 template<typename FirstType, typename... OtherTypes>
 constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
@@ -333,26 +344,28 @@ constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, Ot
 
 template<typename FirstType, typename... OtherTypes>
 struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[] (const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<Index>>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<Index> >::set(i, *this, value);
   }
 
   EIGEN_DEVICE_FUNC  constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
   EIGEN_DEVICE_FUNC  constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
-  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
   }
 };
 
 namespace internal {
 
-template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
-  size_t result = 1;
-  for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
+template<typename FirstType, typename... OtherTypes>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+  Index result = 1;
+  EIGEN_UNROLL_LOOP
+  for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
     result *= sizes[i];
   }
   return result;
@@ -372,30 +385,30 @@ template<typename FirstType, typename... OtherTypes> struct array_size<const Ind
   static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
 };
 
-template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
+template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) {
   return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
-template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) {
+template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) {
   return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
 
 template <typename T>
 struct index_known_statically_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
   }
 };
@@ -447,14 +460,14 @@ template <typename FirstType, typename... OtherTypes>
 
 template <typename Tx>
 struct index_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) == value);
   }
@@ -462,7 +475,7 @@ struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) == value);
   }
@@ -471,14 +484,14 @@ struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_ne_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) != value);
   }
@@ -486,7 +499,7 @@ struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) != value);
   }
@@ -495,14 +508,14 @@ struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_gt_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) > value);
   }
@@ -510,7 +523,7 @@ struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) > value);
   }
@@ -520,14 +533,14 @@ struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_lt_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) < value);
   }
@@ -535,7 +548,7 @@ struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) < value);
   }
@@ -545,14 +558,14 @@ struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename Tx>
 struct index_pair_first_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
   }
@@ -560,7 +573,7 @@ struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes..
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
   }
@@ -570,14 +583,14 @@ struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherT
 
 template <typename Tx>
 struct index_pair_second_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
   }
@@ -585,7 +598,7 @@ struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes.
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
   }
@@ -602,7 +615,7 @@ namespace internal {
 
 template <typename T>
 struct index_known_statically_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) {
     return false;
   }
 };
@@ -623,42 +636,42 @@ struct indices_statically_known_to_increase_impl {
 
 template <typename T>
 struct index_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_ne_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_gt_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_lt_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename Tx>
 struct index_pair_first_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename Tx>
 struct index_pair_second_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
@@ -674,7 +687,7 @@ struct index_pair_second_statically_eq_impl {
 namespace Eigen {
 namespace internal {
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
   return index_known_statically_impl<T>::run(i);
 }
 
@@ -689,32 +702,32 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increa
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
   return index_statically_eq_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
   return index_statically_ne_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
   return index_statically_gt_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
   return index_statically_lt_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
   return index_pair_first_statically_eq_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
   return index_pair_second_statically_eq_impl<T>::run(i, value);
 }
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index f391fb9ee55dbd0b3f707d1248d254656c69de76..c5cb61af53468fd38def4498dab4cab50ae07488 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -31,6 +31,7 @@ struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Strides, typename XprType>
@@ -84,18 +85,25 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_strides(op.strides())
   {
     m_dimensions = m_impl.dimensions();
@@ -129,11 +137,11 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -144,6 +152,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
     eigen_assert(index < dimensions().TotalSize());
     *inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@@ -158,6 +167,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
       *inputIndex += index / m_strides[0];
       return true;
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i];
         if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@@ -193,6 +203,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -213,7 +224,14 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
                         compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   Dimensions m_dimensions;
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
index 33edc49e39a9e0591a325610a686465f186247bf..26a3818f3bce40cc6e4d249d5a674870a5a79139 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -32,7 +32,7 @@ struct Initializer {
                   Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
                   const InitList& vals) {
     int i = 0;
-    for (auto v : vals) {
+    for (const auto& v : vals) {
       (*indices)[traits<Derived>::NumDimensions - N] = i++;
       Initializer<Derived, N - 1>::run(tensor, indices, v);
     }
@@ -48,7 +48,7 @@ struct Initializer<Derived, 1> {
                   const InitList& vals) {
     int i = 0;
     // There is likely a faster way to do that than iterating.
-    for (auto v : vals) {
+    for (const auto& v : vals) {
       (*indices)[traits<Derived>::NumDimensions - 1] = i++;
       tensor.coeffRef(*indices) = v;
     }
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index ede3939c260dfde66a1c0f605c2e0ba52775996a..6d5cce4aa6fe65c7507309e3278da65a04ec93a1 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -21,7 +21,7 @@ namespace Eigen {
   * \brief Fast integer division by a constant.
   *
   * See the paper from Granlund and Montgomery for explanation.
-  *   (at http://dx.doi.org/10.1145/773473.178249)
+  *   (at https://doi.org/10.1145/773473.178249)
   *
   * \sa Tensor
   */
@@ -35,8 +35,10 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
   {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return __clz(val);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::clz(val);
 #elif EIGEN_COMP_MSVC
     unsigned long index;
     _BitScanReverse(&index, val);
@@ -51,8 +53,10 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
   {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return __clzll(val);
+#elif defined(SYCL_DEVICE_ONLY)
+    return static_cast<int>(cl::sycl::clz(val));
 #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
     unsigned long index;
     _BitScanReverse64(&index, val);
@@ -86,8 +90,10 @@ namespace {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __umulhi(a, b);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
 #else
     return (static_cast<uint64_t>(a) * b) >> 32;
 #endif
@@ -95,9 +101,11 @@ namespace {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __umul64hi(a, b);
-#elif defined(__SIZEOF_INT128__)
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
+#elif EIGEN_HAS_BUILTIN_INT128
     __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
     return static_cast<uint64_t>(v >> 64);
 #else
@@ -116,7 +124,7 @@ namespace {
   template <typename T>
   struct DividerHelper<64, T> {
     static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
+#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
       return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
 #else
       const uint64_t shift = 1ULL << log_div;
@@ -159,7 +167,7 @@ struct TensorIntDivisor {
     shift2 = log_div > 1 ? log_div-1 : 0;
   }
 
-  // Must have 0 <= numerator. On platforms that dont support the __uint128_t
+  // Must have 0 <= numerator. On platforms that don't support the __uint128_t
   // type numerator should also be less than 2^32-1.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
     eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
@@ -195,8 +203,10 @@ class TensorIntDivisor<int32_t, true> {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return (__umulhi(magic, n) >> shift);
+#elif defined(SYCL_DEVICE_ONLY)
+    return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
 #else
     uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
     return (static_cast<uint32_t>(v >> 32) >> shift);
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index cd0109ef44d2b2b606c877b54b73ed8ab2a059b0..80106c1a0093dcfaf8b98f8e67ee4738c4c203da 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -46,6 +46,7 @@ struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename XprType>
@@ -68,39 +69,22 @@ template<typename XprType>
 class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
 {
   public:
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
+    typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
-      : m_xpr(expr) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
+        : m_xpr(expr) {}
 
     EIGEN_DEVICE_FUNC
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other)
-    {
-      typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp)
   protected:
     typename XprType::Nested m_xpr;
 };
@@ -118,12 +102,18 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false,  // to be implemented
     RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     for(int i = 0; i < NumDims; ++i) {
@@ -131,16 +121,25 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
     }
   }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -159,7 +158,9 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
     return m_impl.costPerCoeff(vectorized);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+    return constCast(m_impl.data());
+  }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
@@ -180,11 +181,17 @@ template<typename ArgType, typename Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false  // to be implemented
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index ee0078bbcc4cbf67edca70dec65fdddfad48189d..73ff3d2db1a981e3dfd023c9d0504e8d5a78afad 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
  */
 
 // SFINAE requires variadic templates
-#ifndef __CUDACC__
+#if !defined(EIGEN_GPUCC)
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   // SFINAE doesn't work for gcc <= 4.7
   #ifdef EIGEN_COMP_GNUC
@@ -43,12 +43,56 @@
 #define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
     typename internal::enable_if< ( __condition__ ) , int >::type = 0
 
+// Define a macro to use a reference on the host but a value on the device
+#if defined(SYCL_DEVICE_ONLY)
+  #define EIGEN_DEVICE_REF
+#else
+  #define EIGEN_DEVICE_REF &
+#endif
+
+// Define a macro for catching SYCL exceptions if exceptions are enabled
+#define EIGEN_SYCL_TRY_CATCH(X) \
+  do { \
+    EIGEN_TRY {X;} \
+    EIGEN_CATCH(const cl::sycl::exception& e) { \
+      EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \
+                                       std::string(__FILE__) + ":" + \
+                                       std::to_string(__LINE__) + "\n" + \
+                                       e.what())); \
+    } \
+  } while (false)
 
-#if EIGEN_HAS_CONSTEXPR
-#define EIGEN_CONSTEXPR constexpr
+// Define a macro if local memory flags are unset or one of them is set
+// Setting both flags is the same as unsetting them
+#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
+     (defined(EIGEN_SYCL_LOCAL_MEM) &&  defined(EIGEN_SYCL_NO_LOCAL_MEM))
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#endif
+
+#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+  #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    template <typename OtherDerived> \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; }
 #else
-#define EIGEN_CONSTEXPR
+  #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
 #endif
 
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ * This also inherits template<OtherDerived> operator=(const OtherDerived&) assignments.
+ * With C++11 or later this also default-implements the copy-constructor
+ */
+#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived)  \
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
 
 #endif
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index e4fc86a40cfcf329a9d0259032ce9cd6923fd6f7..6834c97e44152abaaeb01eaf0b325b22a41eb1c0 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -30,21 +30,39 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 {
   public:
     typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
-    typedef typename PlainObjectType::Base Base;
-    typedef typename Eigen::internal::nested<Self>::type Nested;
-    typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+    typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base;
+  #ifdef EIGEN_USE_SYCL
+    typedef  typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
+  #else
+     typedef typename Eigen::internal::nested<Self>::type Nested;
+  #endif
+   typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
     typedef typename internal::traits<PlainObjectType>::Index Index;
     typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType;
 
-  /*    typedef typename internal::conditional<
-                         bool(internal::is_lvalue<PlainObjectType>::value),
-                         Scalar *,
-                         const Scalar *>::type
-                     PointerType;*/
     typedef typename MakePointer_<Scalar>::Type PointerType;
-    typedef PointerType PointerArgType;
+    typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
+
+    // WARN: PointerType still can be a pointer to const (const Scalar*), for
+    // example in TensorMap<Tensor<const Scalar, ...>> expression. This type of
+    // expression should be illegal, but adding this restriction is not possible
+    // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488).
+    typedef typename internal::conditional<
+        bool(internal::is_lvalue<PlainObjectType>::value),
+        PointerType,      // use simple pointer in lvalue expressions
+        PointerConstType  // use const pointer in rvalue expressions
+        >::type StoragePointerType;
+
+    // If TensorMap was constructed over rvalue expression (e.g. const Tensor),
+    // we should return a reference to const from operator() (and others), even
+    // if TensorMap itself is not const.
+    typedef typename internal::conditional<
+        bool(internal::is_lvalue<PlainObjectType>::value),
+        Scalar&,
+        const Scalar&
+        >::type StorageRefType;
 
     static const int Options = Options_;
 
@@ -59,47 +77,47 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     };
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
       EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
       EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
       EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
       EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #endif
 
-   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array<Index, NumIndices>& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
     template <typename Dimensions>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
@@ -116,12 +134,12 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE PointerType data() { return m_data; }
+    EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const PointerType data() const { return m_data; }
+    EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const
     {
       //      eigen_assert(checkIndexRange(indices));
       if (PlainObjectType::Options&RowMajor) {
@@ -134,14 +152,14 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()() const
+    EIGEN_STRONG_INLINE StorageRefType operator()() const
     {
       EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
       return m_data[0];
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_data[index];
@@ -149,9 +167,10 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       if (PlainObjectType::Options&RowMajor) {
         const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
         return m_data[index];
@@ -162,7 +181,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i1 + i0 * m_dimensions[1];
@@ -173,7 +192,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const
     {
       if (PlainObjectType::Options&RowMajor) {
          const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
@@ -184,7 +203,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
@@ -195,7 +214,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
@@ -208,7 +227,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 #endif
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+    EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices)
     {
       //      eigen_assert(checkIndexRange(indices));
       if (PlainObjectType::Options&RowMajor) {
@@ -221,14 +240,14 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()()
+    EIGEN_STRONG_INLINE StorageRefType operator()()
     {
       EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
       return m_data[0];
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index index)
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_data[index];
@@ -236,9 +255,10 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
       static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+       eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       const std::size_t NumDims = sizeof...(otherIndices) + 2;
       if (PlainObjectType::Options&RowMajor) {
         const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
@@ -250,7 +270,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1)
     {
        if (PlainObjectType::Options&RowMajor) {
          const Index index = i1 + i0 * m_dimensions[1];
@@ -261,7 +281,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2)
     {
        if (PlainObjectType::Options&RowMajor) {
          const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
@@ -272,7 +292,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3)
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
@@ -283,7 +303,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
@@ -295,26 +315,10 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 #endif
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other)
-    {
-      typedef TensorAssignOp<Self, const Self> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Self& operator=(const OtherDerived& other)
-    {
-      typedef TensorAssignOp<Self, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap)
 
   private:
-    typename MakePointer_<Scalar>::Type m_data;
+    StoragePointerType m_data;
     Dimensions m_dimensions;
 };
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 615559d445473a036fdfd01da703b88febdfcc45..a6181d35e6834f5d1010e2450d5eb04c865a2e3a 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,11 +52,13 @@ struct PacketType : internal::packet_traits<Scalar> {
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
-template <>
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
+
+typedef ulonglong2 Packet4h2;
+template<>
 struct PacketType<half, GpuDevice> {
-  typedef half2 type;
-  static const int size = 2;
+  typedef Packet4h2 type;
+  static const int size = 8;
   enum {
     HasAdd    = 1,
     HasSub    = 1,
@@ -75,6 +77,7 @@ struct PacketType<half, GpuDevice> {
     HasSqrt   = 1,
     HasRsqrt  = 1,
     HasExp    = 1,
+    HasExpm1  = 0,
     HasLog    = 1,
     HasLog1p  = 0,
     HasLog10  = 0,
@@ -84,9 +87,57 @@ struct PacketType<half, GpuDevice> {
 #endif
 
 #if defined(EIGEN_USE_SYCL)
-template <typename T>
-  struct PacketType<T, SyclDevice> {
-  typedef T type;
+
+namespace TensorSycl {
+namespace internal {
+
+template <typename Index, Index A, Index B> struct PlusOp {
+  static constexpr Index Value = A + B;
+};
+
+template <typename Index, Index A, Index B> struct DivOp {
+  static constexpr Index Value = A / B;
+};
+
+template <typename Index, Index start, Index end, Index step,
+          template <class Indx, Indx...> class StepOp>
+struct static_for {
+  template <typename UnaryOperator>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
+    op(start);
+    static_for<Index, StepOp<Index, start, step>::Value, end, step,
+               StepOp>::loop(op);
+  }
+};
+template <typename Index, Index end, Index step,
+          template <class Indx, Indx...> class StepOp>
+struct static_for<Index, end, end, step, StepOp> {
+  template <typename UnaryOperator>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
+};
+
+template <typename OutScalar, typename Device, bool Vectorizable>
+struct Vectorise {
+  static const int PacketSize = 1;
+  typedef OutScalar PacketReturnType;
+};
+
+template <typename OutScalar, typename Device>
+struct Vectorise<OutScalar, Device, true> {
+  static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
+  typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
+};
+
+static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
+  return ((((x) + (y)-1) / (y)) * (y));
+}
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <>
+  struct PacketType<half, SyclDevice> {
+  typedef half type;
   static const int size = 1;
   enum {
     HasAdd    = 0,
@@ -103,8 +154,58 @@ template <typename T>
     HasBlend  = 0
   };
 };
-#endif
+template <typename Scalar>
+struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
+  typedef Scalar type;
+  typedef Scalar half;
+  enum {
+    Vectorizable = 0,
+    size = 1,
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
+  };
+  enum {
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0
+  };
+
+};
+
+template <typename Scalar>
+struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};
+
+#ifndef EIGEN_DONT_VECTORIZE_SYCL
+#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
+template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
+{\
+  typedef typename internal::packet_traits<Type>::type type;\
+  typedef typename internal::packet_traits<Type>::half half;\
+};
+
+
+PACKET_TYPE(const, float, 1, 4, SyclDevice)
+PACKET_TYPE(, float, 1, 4, SyclDevice)
+PACKET_TYPE(const, float, 1, 4, const SyclDevice)
+PACKET_TYPE(, float, 1, 4, const SyclDevice)
 
+PACKET_TYPE(const, double, 0, 2, SyclDevice)
+PACKET_TYPE(, double, 0, 2, SyclDevice)
+PACKET_TYPE(const, double, 0, 2, const SyclDevice)
+PACKET_TYPE(, double, 0, 2, const SyclDevice)
+#undef PACKET_TYPE
+
+template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
+template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
+#endif
+#endif
 
 // Tuple mimics std::pair but works on e.g. nvcc.
 template <typename U, typename V> struct Tuple {
@@ -121,14 +222,6 @@ template <typename U, typename V> struct Tuple {
   EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Tuple(const U& f, const V& s) : first(f), second(s) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Tuple& operator= (const Tuple& rhs) {
-    if (&rhs == this) return *this;
-    first = rhs.first;
-    second = rhs.second;
-    return *this;
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void swap(Tuple& rhs) {
     using numext::swap;
@@ -168,12 +261,12 @@ template <typename Idx> struct IndexPair {
 #ifdef EIGEN_HAS_SFINAE
 namespace internal {
 
-  template<typename IndexType, Index... Is>
+  template<typename IndexType, typename Index, Index... Is>
   EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
     return { idx[Is]... };
   }
-  template<typename IndexType>
+  template<typename IndexType, typename Index>
   EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
     return array<Index, 0>();
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index d34f1e328c60d984bd4fe234552557586a6ee25e..b3f00f77af6a317b19871b7c4f034ad27c3bb0df 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -31,12 +31,13 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<NewDimensions>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename NewDimensions, typename XprType>
 struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
 {
-  typedef const TensorReshapingOp<NewDimensions, XprType>& type;
+  typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename NewDimensions, typename XprType>
@@ -53,6 +54,7 @@ template<typename NewDimensions, typename XprType>
 class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
 {
   public:
+  typedef TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> Base;
   typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
@@ -69,24 +71,7 @@ class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, Xpr
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other)
-    {
-      typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -101,15 +86,63 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
   typedef NewDimensions Dimensions;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+
+  static const int NumOutputDims = internal::array_size<Dimensions>::value;
+  static const int NumInputDims  = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+
+  enum ReshapingKind {
+    // We do not use layout information to determine reshaping kind.
+    // Depending on the layout `N` can be inner or outer dimension.
+    OneByN = 0,  // expr.reshape(1, N)
+    NByOne = 1,  // expr.reshape(N, 1)
+    Runtime = 2  // Reshape dimensions are dynamic (specified at runtime).
+  };
+
+  // clang-format off
+  static const ReshapingKind kind =
+#if defined(EIGEN_HAS_INDEX_LIST)
+        (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
+      : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
+      : Runtime;
+#else
+        Runtime;
+#endif
+  // clang-format on
+
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    // For trivial reshapes with raw access to underlying data we will provide
+    // zero overhead block access.
+    // TODO(ezhulenev): Consider adding block access without raw access?
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess &&
+                        NumInputDims > 0 && NumOutputDims > 0,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef
+      typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims,
+                                                 Layout, Index>
+          TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dimensions(op.dimensions())
   {
     // The total size of the reshaped tensor must be equal to the total size
@@ -117,17 +150,20 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType data, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -146,10 +182,53 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     return m_impl.costPerCoeff(vectorized);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  // required in block(OutputTensorBlock* output_block) const
+  // For C++03 compatibility this must be defined outside the method
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_impl.data() != NULL);
+    eigen_assert((kind == Runtime) ||
+                 (kind == OneByN && desc.dimensions()[0] == 1) ||
+                 (kind == NByOne && desc.dimensions()[1] == 1));
+
+    if (kind == OneByN || kind == NByOne) {
+      // We can guarantee at compile time that block is just a contiguous slice
+      // of the underlying expression memory buffer.
+      return TensorBlock(internal::TensorBlockKind::kView,
+                           m_impl.data() + desc.offset(), desc.dimensions());
+    } else {
+      // This will do additional runtime checks, and in the end it might be also
+      // a view, or it might be a block materialized in the temporary buffer.
+      return TensorBlock::materialize(m_impl.data(), m_dimensions, desc,
+                                        scratch);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+    return constCast(m_impl.data());
+  }
 
   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+  #endif
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
   NewDimensions m_dimensions;
@@ -167,14 +246,16 @@ template<typename NewDimensions, typename ArgType, typename Device>
   typedef NewDimensions Dimensions;
 
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }
 
@@ -183,15 +264,38 @@ template<typename NewDimensions, typename ArgType, typename Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index>
+      TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(index);
   }
+
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
     this->m_impl.template writePacket<StoreMode>(index, x);
   }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    assert(this->m_impl.data() != NULL);
+
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<
+        Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(),
+                                  internal::strides<Layout>(this->dimensions()),
+                                  this->m_impl.data(), desc.offset()),
+        block.expr());
+  }
 };
 
 
@@ -214,12 +318,13 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<StartIndices>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
 struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
 {
-  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
@@ -236,6 +341,7 @@ template<typename StartIndices, typename Sizes, typename XprType>
 class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
 {
   public:
+  typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base;
   typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
@@ -254,25 +360,7 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other)
-    {
-      typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -283,9 +371,12 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
 
 // Fixme: figure out the exact threshold
 namespace {
-template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
+template <typename Index, typename Device, bool BlockAccess> struct MemcpyTriggerForSlicing {
   EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
-  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; }
+  EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const {
+    const bool prefer_block_evaluation = BlockAccess && total > 32*1024;
+    return !prefer_block_evaluation && contiguous > threshold_;
+  }
 
  private:
   Index threshold_;
@@ -294,11 +385,21 @@ template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
 // It is very expensive to start the memcpy kernel on GPU: we therefore only
 // use it for large copies.
 #ifdef EIGEN_USE_GPU
-template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice>  {
+template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess>  {
   EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
-  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
+  EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
 };
 #endif
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_SYCL
+template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess>  {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
+};
+#endif
+
 }
 
 // Eval as rvalue
@@ -308,23 +409,56 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets and sizes.
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess &&
+                        // FIXME: Temporary workaround for bug in slicing of bool tensors.
+                        !internal::is_same<typename internal::remove_const<Scalar>::type, bool>::value,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  // Tensor slicing does not change the block type.
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
   {
-    for (std::size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
-      eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
+    m_is_identity = true;
+    for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_impl.dimensions()[i] >=
+                   op.sizes()[i] + op.startIndices()[i]);
+      if (m_impl.dimensions()[i] != op.sizes()[i] ||
+          op.startIndices()[i] != 0) {
+        m_is_identity = false;
+      }
     }
 
+    // No strides for scalars.
+    if (NumDims == 0) return;
+
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Sizes& output_dims = op.sizes();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -337,7 +471,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     } else {
       m_inputStrides[NumDims-1] = 1;
@@ -349,23 +483,17 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       m_outputStrides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     }
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef Sizes Dimensions;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
-    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data && m_impl.data()) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
+        && data && m_impl.data()) {
       Index contiguous_values = 1;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         for (int i = 0; i < NumDims; ++i) {
@@ -383,12 +511,12 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
         }
       }
       // Use memcpy if it's going to be faster than using the regular evaluation.
-      const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
-      if (trigger(contiguous_values)) {
-        Scalar* src = (Scalar*)m_impl.data();
-        for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+      const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
+      if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
+        EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
+        for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
           Index offset = srcCoeff(i);
-          m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
+          m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
         }
         return false;
       }
@@ -396,25 +524,42 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType /*data*/, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
 
+    if (m_is_identity) {
+      return m_impl.template packet<LoadMode>(index);
+    }
+
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -426,6 +571,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       inputIndices[0] += (indices[0] + m_offsets[0]);
       inputIndices[1] += (indices[1] + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -445,6 +591,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -454,12 +601,28 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
+    TensorBlock block = m_impl.block(arg_desc, scratch);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+    return block;
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
-    Scalar* result = m_impl.data();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
     if (result) {
       Index offset = 0;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -493,12 +656,19 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
     return NULL;
   }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -506,6 +676,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       }
       inputIndex += (index + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -520,8 +691,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
+  bool m_is_identity;
   const StartIndices m_offsets;
 };
 
@@ -535,36 +707,55 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
-  enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = false
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-    : Base(op, device)
-    { }
-
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Sizes Dimensions;
 
+  enum {
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+    { }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    if (this->m_is_identity) {
+      this->m_impl.template writePacket<StoreMode>(index, x);
+      return;
+    }
+
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -576,6 +767,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       inputIndices[0] += (indices[0] + this->m_offsets[0]);
       inputIndices[1] += (indices[1] + this->m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -595,14 +787,20 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
     }
   }
-};
-
 
+  template<typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
+    this->m_impl.writeBlock(arg_desc, block);
+  }
+};
 
 namespace internal {
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -616,12 +814,13 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<StartIndices>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
 struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -637,6 +836,7 @@ template<typename StartIndices, typename StopIndices, typename Strides, typename
 class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
 {
   public:
+  typedef TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > Base;
   typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
@@ -660,26 +860,7 @@ class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartI
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
-    {
-      typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -694,6 +875,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
 {
   typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
   static const int NumDims = internal::array_size<Strides>::value;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef Strides Dimensions;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
@@ -701,43 +889,58 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_device(device),
+        m_strides(op.strides())
   {
     // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
-    DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
-    for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+    DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
+    for (ptrdiff_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
       eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
-      if(m_strides[i]>0){
-        startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
-        stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
-      }else{
-        /* implies m_strides[i]<0 by assert */
-        startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
-        stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+      if (m_strides[i] > 0) {
+        startIndicesClamped[i] =
+            clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+        stopIndicesClamped[i] =
+            clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+      } else {
+        /* implies m_strides[i] < 0 by assert */
+        startIndicesClamped[i] =
+            clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+        stopIndicesClamped[i] =
+            clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
       }
       m_startIndices[i] = startIndicesClamped[i];
     }
 
-    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+    const InputDimensions& input_dims = m_impl.dimensions();
 
-    // check for degenerate intervals and compute output tensor shape
-    bool degenerate = false;;
-    for(int i = 0; i < NumDims; i++){
+    // compute output tensor shape
+    m_is_identity = true;
+    for (int i = 0; i < NumDims; i++) {
       Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
-      if(interval == 0 || ((interval<0) != (m_strides[i]<0))){
+      if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) {
         m_dimensions[i] = 0;
-        degenerate = true;
-      }else{
-        m_dimensions[i] = interval / m_strides[i]
-                          + (interval % m_strides[i] != 0 ? 1 : 0);
+      } else {
+        m_dimensions[i] =
+            (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0);
         eigen_assert(m_dimensions[i] >= 0);
       }
+      if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) {
+        m_is_identity = false;
+      }
     }
+
     Strides output_dims = m_dimensions;
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -754,8 +957,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
-        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     } else {
       m_inputStrides[NumDims-1] = m_strides[NumDims-1];
@@ -770,58 +972,58 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
       m_outputStrides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
-        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     }
-    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                          device.lastLevelCacheSize() /
-                                          sizeof(Scalar));
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef Strides Dimensions;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
     return NULL;
   }
-
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
         index -= idx * m_outputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
@@ -831,20 +1033,24 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     return inputIndex;
   }
 
-  static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+#ifndef SYCL_DEVICE_ONLY
     return numext::maxi(min, numext::mini(max,value));
+#else
+    return cl::sycl::clamp(value, min, max);
+#endif
   }
 
   array<Index, NumDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
+  bool m_is_identity;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   DSizes<Index, NumDims> m_startIndices; // clamped startIndices
   DSizes<Index, NumDims> m_dimensions;
   DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
   const Strides m_strides;
-  std::size_t m_block_total_size_max;
 };
 
 // Eval as lvalue
@@ -860,25 +1066,33 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Strides Dimensions;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 };
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 647bcf10887e26cb55fe50e5c768881eeb50770f..ee44382cffacfcaf0da049991252fe6f3353ecc8 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -31,6 +31,7 @@ struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprT
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename PaddingDimensions, typename XprType>
@@ -90,18 +91,33 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
-    RawAccess = false
+    IsAligned         = true,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = true,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
   {
     // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
     // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
@@ -135,11 +151,20 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -148,6 +173,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -161,6 +187,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
       inputIndex += (index - m_padding[0].first);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i+1];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -189,18 +216,298 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     TensorOpCost cost = m_impl.costPerCoeff(vectorized);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i)
         updateCostPerDimension(cost, i, i == 0);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i)
         updateCostPerDimension(cost, i, i == NumDims - 1);
     }
     return cost;
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // If one of the dimensions is zero, return empty block view.
+    if (desc.size() == 0) {
+      return TensorBlock(internal::TensorBlockKind::kView, NULL,
+                           desc.dimensions());
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[inner_dim_idx] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal and
+    // point outside of the input buffer, because we don't check for negative
+    // offsets, it will be autocorrected in the block iteration loop below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides =
+        internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also in practice first outer
+    // dimension usually has padding, and will prevent squeezing.
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      it[i].count = 0;
+      it[i].size = desc.dimension(dim);
+
+      it[i].input_stride = m_inputStrides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      it[i].output_stride = output_strides[dim];
+      it[i].output_span = it[i].output_stride * (it[i].size - 1);
+    }
+
+    const Index input_inner_dim_size =
+        static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
+
+    // Total output size.
+    const Index output_size = desc.size();
+
+    // We will fill inner dimension of this size in the output. It might be
+    // larger than the inner dimension in the input, so we might have to pad
+    // before/after we copy values from the input inner dimension.
+    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+    // How many values to fill with padding BEFORE reading from the input inner
+    // dimension.
+    const Index output_inner_pad_before_size =
+        input_offsets[inner_dim_idx] < 0
+            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
+                           output_inner_dim_size)
+            : 0;
+
+    // How many values we can actually copy from the input inner dimension.
+    const Index output_inner_copy_size = numext::mini(
+        // Want to copy from input.
+        (output_inner_dim_size - output_inner_pad_before_size),
+        // Can copy from input.
+        numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
+                                             output_inner_pad_before_size),
+                     Index(0)));
+
+    eigen_assert(output_inner_copy_size >= 0);
+
+    // How many values to fill with padding AFTER reading from the input inner
+    // dimension.
+    const Index output_inner_pad_after_size =
+        (output_inner_dim_size - output_inner_copy_size -
+         output_inner_pad_before_size);
+
+    // Sanity check, sum of all sizes must be equal to the output size.
+    eigen_assert(output_inner_dim_size ==
+                 (output_inner_pad_before_size + output_inner_copy_size +
+                  output_inner_pad_after_size));
+
+    // Keep track of current coordinates and padding in the output.
+    DSizes<Index, NumDims> output_coord = output_offsets;
+    DSizes<Index, NumDims> output_padded;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+    }
+
+    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+    // Prepare storage for the materialized padding result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+
+    // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
+    // single logical inner dimension.
+
+    // When possible we squeeze writes for the innermost (only if non-padded)
+    // dimension with the first padded dimension. This allows to reduce the
+    // number of calls to LinCopy and better utilize vector instructions.
+    const bool squeeze_writes =
+        NumDims > 1 &&
+        // inner dimension is not padded
+        (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
+        // and equal to the block inner dimension
+        (input_inner_dim_size == output_inner_dim_size);
+
+    const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
+
+    // Maximum coordinate on a squeeze dimension that we can write to.
+    const Index squeeze_max_coord =
+        squeeze_writes ? numext::mini(
+                             // max non-padded element in the input
+                             static_cast<Index>(m_dimensions[squeeze_dim] -
+                                                m_padding[squeeze_dim].second),
+                             // max element in the output buffer
+                             static_cast<Index>(output_offsets[squeeze_dim] +
+                                                desc.dimension(squeeze_dim)))
+                       : static_cast<Index>(0);
+
+    // Iterate copying data from `m_impl.data()` to the output buffer.
+    for (Index size = 0; size < output_size;) {
+      // Detect if we are in the padded region (exclude innermost dimension).
+      bool is_padded = false;
+      for (int j = 1; j < NumDims; ++j) {
+        const int dim = IsColMajor ? j : NumDims - j - 1;
+        is_padded = output_padded[dim];
+        if (is_padded) break;
+      }
+
+      if (is_padded) {
+        // Fill single innermost dimension with padding value.
+        size += output_inner_dim_size;
+
+        LinCopy::template Run<LinCopy::Kind::FillLinear>(
+            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+            typename LinCopy::Src(0, 0, &m_paddingValue),
+            output_inner_dim_size);
+
+
+      } else if (squeeze_writes) {
+        // Squeeze multiple reads from innermost dimensions.
+        const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
+        size += output_inner_dim_size * squeeze_num;
+
+        // Copy `squeeze_num` inner dimensions from input to output.
+        LinCopy::template Run<LinCopy::Kind::Linear>(
+            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+            typename LinCopy::Src(input_offset, 1, m_impl.data()),
+            output_inner_dim_size * squeeze_num);
+
+        // Update iteration state for only `squeeze_num - 1` processed inner
+        // dimensions, because we have another iteration state update at the end
+        // of the loop that will update iteration state for the last inner
+        // processed dimension.
+        it[0].count += (squeeze_num - 1);
+        input_offset += it[0].input_stride * (squeeze_num - 1);
+        output_offset += it[0].output_stride * (squeeze_num - 1);
+        output_coord[squeeze_dim] += (squeeze_num - 1);
+
+      } else {
+        // Single read from innermost dimension.
+        size += output_inner_dim_size;
+
+        {  // Fill with padding before copying from input inner dimension.
+          const Index out = output_offset;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_before_size);
+        }
+
+        {  // Copy data from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size;
+          const Index in = input_offset + output_inner_pad_before_size;
+
+          eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
+
+          LinCopy::template Run<LinCopy::Kind::Linear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(in, 1, m_impl.data()),
+              output_inner_copy_size);
+        }
+
+        {  // Fill with padding after copying from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size +
+                            output_inner_copy_size;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_after_size);
+        }
+      }
+
+      for (int j = 0; j < NumDims - 1; ++j) {
+        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0),
+          size(0),
+          input_stride(0),
+          input_span(0),
+          output_stride(0),
+          output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
       Index index, int dim_index) const {
 #if defined(EIGEN_HAS_INDEX_LIST)
@@ -262,22 +569,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     const Index initialIndex = index;
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index first = index;
-      const Index last = index + PacketSize - 1;
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
       const Index lastPaddedRight = m_outputStrides[i+1];
 
-      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -289,21 +597,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
     }
 
-    const Index last = index + PacketSize - 1;
-    const Index first = index;
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
     const Index lastPaddedLeft = m_padding[0].first;
     const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
     const Index lastPaddedRight = m_outputStrides[1];
 
-    if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[0].first);
       return m_impl.template packet<Unaligned>(inputIndex);
@@ -319,23 +627,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     const Index initialIndex = index;
     Index inputIndex = 0;
-
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
-      const Index first = index;
-      const Index last = index + PacketSize - 1;
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
       const Index lastPaddedRight = m_outputStrides[i];
 
-      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i+1];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -347,21 +655,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
     }
 
-    const Index last = index + PacketSize - 1;
-    const Index first = index;
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
     const Index lastPaddedLeft = m_padding[NumDims-1].first;
     const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
     const Index lastPaddedRight = m_outputStrides[NumDims-1];
 
-    if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[NumDims-1].first);
       return m_impl.template packet<Unaligned>(inputIndex);
@@ -373,6 +681,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -387,6 +696,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   PaddingDimensions m_padding;
 
   Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 886a254f66ffdeb8627ba74cb8aa67b934cad5bf..413d25dd4da57c6ee7c313e13d7ea92a82bd168d 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -31,6 +31,7 @@ struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename PatchDim, typename XprType>
@@ -87,18 +88,26 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
  };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     Index num_patches = 1;
@@ -143,12 +152,12 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -161,6 +170,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
     Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -169,6 +179,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
         inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -196,6 +207,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
 
     Index inputIndices[2] = {0, 0};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -211,6 +223,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
         inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -237,6 +250,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
       EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -253,7 +267,14 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh); 
+  }
+#endif
 
  protected:
   Dimensions m_dimensions;
@@ -262,6 +283,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   array<Index, NumDims-1> m_patchStrides;
 
   TensorEvaluator<ArgType, Device> m_impl;
+
 };
 
 } // end namespace Eigen
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 1655a813e4f2f440037930517841b74c749d7d46..37c1d1c3d727eb1c09c56b7c3c9849e1bbb31a4e 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -16,50 +17,23 @@ namespace internal {
 namespace {
 
 EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   // We don't support 3d kernels since we currently only use 1 and
   // 2d kernels.
-  assert(threadIdx.z == 0);
-  return clock64() +
-      blockIdx.x * blockDim.x + threadIdx.x +
-      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
-
-#elif defined _WIN32
-  // Use the current time as a baseline.
-  SYSTEMTIME st;
-  GetSystemTime(&st);
-  int time = st.wSecond + 1000 * st.wMilliseconds;
-  // Mix in a random number to make sure that we get different seeds if
-  // we try to generate seeds faster than the clock resolution.
-  // We need 2 random values since the generator only generate 16 bits at
-  // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
-  int rnd1 = ::rand();
-  int rnd2 = ::rand();
-  uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
-  return rnd;
-
-#elif defined __APPLE__
-  // Same approach as for win32, except that the random number generator
-  // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
-  uint64_t rnd = ::random() ^ mach_absolute_time();
-  return rnd;
-
+  gpu_assert(threadIdx.z == 0);
+  return blockIdx.x * blockDim.x + threadIdx.x 
+         + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
 #else
-  // Augment the current time with pseudo random number generation
-  // to ensure that we get different seeds if we try to generate seeds
-  // faster than the clock resolution.
-  timespec ts;
-  clock_gettime(CLOCK_REALTIME, &ts);
-  uint64_t rnd = ::random() ^ ts.tv_nsec;
-  return rnd;
+  // Rely on Eigen's random implementation.
+  return random<uint64_t>();
 #endif
 }
 
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
   // TODO: Unify with the implementation in the non blocking thread pool.
   uint64_t current = *state;
   // Update the internal state
-  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
   // Generate the random output (using the PCG-XSH-RS scheme)
   return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
 }
@@ -73,34 +47,42 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t
 
 
 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeUniform(uint64_t* state) {
-  unsigned rnd = PCG_XSH_RS_generator(state);
+T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
   return static_cast<T>(rnd);
 }
 
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
-  Eigen::half result;
-  // Generate 10 random bits for the mantissa
-  unsigned rnd = PCG_XSH_RS_generator(state);
-  result.x = static_cast<uint16_t>(rnd & 0x3ffu);
-  // Set the exponent
-  result.x |= (static_cast<uint16_t>(15) << 10);
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
+  // Generate 10 random bits for the mantissa, merge with exponent.
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
+  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
   // Return the final result
   return result - Eigen::half(1.0f);
 }
 
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
+
+  // Generate 7 random bits for the mantissa, merge with exponent.
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
+  // Return the final result
+  return result - Eigen::bfloat16(1.0f);
+}
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float RandomToTypeUniform<float>(uint64_t* state) {
+float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
   typedef union {
     uint32_t raw;
     float fp;
   } internal;
   internal result;
   // Generate 23 random bits for the mantissa mantissa
-  const unsigned rnd = PCG_XSH_RS_generator(state);
+  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
   result.raw = rnd & 0x7fffffu;
   // Set the exponent
   result.raw |= (static_cast<uint32_t>(127) << 23);
@@ -109,7 +91,7 @@ float RandomToTypeUniform<float>(uint64_t* state) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double RandomToTypeUniform<double>(uint64_t* state) {
+double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
   typedef union {
     uint64_t raw;
     double dp;
@@ -118,9 +100,9 @@ double RandomToTypeUniform<double>(uint64_t* state) {
   result.raw = 0;
   // Generate 52 random bits for the mantissa
   // First generate the upper 20 bits
-  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
   // The generate the lower 32 bits
-  unsigned rnd2 = PCG_XSH_RS_generator(state);
+  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
   result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
   // Set the exponent
   result.raw |= (static_cast<uint64_t>(1023) << 52);
@@ -129,14 +111,14 @@ double RandomToTypeUniform<double>(uint64_t* state) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
-  return std::complex<float>(RandomToTypeUniform<float>(state),
-                             RandomToTypeUniform<float>(state));
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
+  return std::complex<float>(RandomToTypeUniform<float>(state, stream),
+                             RandomToTypeUniform<float>(state, stream));
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
-  return std::complex<double>(RandomToTypeUniform<double>(state),
-                              RandomToTypeUniform<double>(state));
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
+  return std::complex<double>(RandomToTypeUniform<double>(state, stream),
+                              RandomToTypeUniform<double>(state, stream));
 }
 
 template <typename T> class UniformRandomGenerator {
@@ -147,17 +129,42 @@ template <typename T> class UniformRandomGenerator {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+    #ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefor, we need two step to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is
+    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
+    // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
+    // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
+    // similar to CUDA Therefore, the thread Id injection is not available at this stage.
+    //However when the operator() is called the thread ID will be avilable. So inside the opeator,
+    // we add the thrreadID, BlockId,... (which is equivalent of i)
+    //to the seed and construct the unique m_state per thead similar to cuda.
+    m_exec_once =false;
+   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       const UniformRandomGenerator& other) {
     m_state = other.m_state;
+    #ifdef EIGEN_USE_SYCL
+     m_exec_once =other.m_exec_once;
+    #endif
   }
 
   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
-    uint64_t local_state = m_state + i;
-    T result = RandomToTypeUniform<T>(&local_state);
-    m_state = local_state;
+    #ifdef EIGEN_USE_SYCL
+      if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
+       m_state += (i * 6364136223846793005ULL);
+       m_exec_once =true;
+      }
+    #endif
+    T result = RandomToTypeUniform<T>(&m_state, i);
     return result;
   }
 
@@ -165,16 +172,25 @@ template <typename T> class UniformRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
-    uint64_t local_state = m_state + i;
+      #ifdef EIGEN_USE_SYCL
+      if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+       m_state += (i * 6364136223846793005ULL);
+       m_exec_once =true;
+      }
+    #endif
+    EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
-      values[j] = RandomToTypeUniform<T>(&local_state);
+      values[j] = RandomToTypeUniform<T>(&m_state, i);
     }
-    m_state = local_state;
     return internal::pload<Packet>(values);
   }
 
  private:
   mutable uint64_t m_state;
+  #ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+  #endif
 };
 
 template <typename Scalar>
@@ -190,14 +206,14 @@ struct functor_traits<UniformRandomGenerator<Scalar> > {
 
 
 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeNormal(uint64_t* state) {
+T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
   // Use the ratio of uniform method to generate numbers following a normal
   // distribution. See for example Numerical Recipes chapter 7.3.9 for the
   // details.
   T u, v, q;
   do {
-    u = RandomToTypeUniform<T>(state);
-    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+    u = RandomToTypeUniform<T>(state, stream);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
     const T x = u - T(0.449871);
     const T y = numext::abs(v) + T(0.386595);
     q = x*x + y * (T(0.196)*y - T(0.25472)*x);
@@ -208,14 +224,14 @@ T RandomToTypeNormal(uint64_t* state) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
-  return std::complex<float>(RandomToTypeNormal<float>(state),
-                             RandomToTypeNormal<float>(state));
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
+  return std::complex<float>(RandomToTypeNormal<float>(state, stream),
+                             RandomToTypeNormal<float>(state, stream));
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
-  return std::complex<double>(RandomToTypeNormal<double>(state),
-                              RandomToTypeNormal<double>(state));
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
+  return std::complex<double>(RandomToTypeNormal<double>(state, stream),
+                              RandomToTypeNormal<double>(state, stream));
 }
 
 
@@ -226,17 +242,38 @@ template <typename T> class NormalRandomGenerator {
   // Uses the given "seed" if non-zero, otherwise uses a random seed.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+    #ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefor, we need two steps to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is
+    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Therefore, the thread Id injection is not available at this stage. However when the operator()
+    //is called the thread ID will be avilable. So inside the opeator,
+    // we add the thrreadID, BlockId,... (which is equivalent of i)
+    //to the seed and construct the unique m_state per thead similar to cuda.
+    m_exec_once =false;
+   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
       const NormalRandomGenerator& other) {
     m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once=other.m_exec_once;
+#endif
   }
 
  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
-    uint64_t local_state = m_state + i;
-    T result = RandomToTypeNormal<T>(&local_state);
-    m_state = local_state;
+    #ifdef EIGEN_USE_SYCL
+    if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once =true;
+    }
+    #endif
+    T result = RandomToTypeNormal<T>(&m_state, i);
     return result;
   }
 
@@ -244,16 +281,25 @@ template <typename T> class NormalRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
-    uint64_t local_state = m_state + i;
+    #ifdef EIGEN_USE_SYCL
+    if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once =true;
+    }
+    #endif
+    EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
-      values[j] = RandomToTypeNormal<T>(&local_state);
+      values[j] = RandomToTypeNormal<T>(&m_state, i);
     }
-    m_state = local_state;
     return internal::pload<Packet>(values);
   }
 
  private:
   mutable uint64_t m_state;
+   #ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+  #endif
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 41d0d0022fd36bbebb6c13db4527e0fa6fd878cb..583f46256ebd8353df03b3b3b34c19054300c387 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,8 +11,20 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+
 namespace Eigen {
 
+
 /** \class TensorReduction
   * \ingroup CXX11_Tensor_Module
   *
@@ -32,6 +44,7 @@ namespace internal {
   typedef typename XprType::Nested Nested;
   static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 
   template <class T> struct MakePointer {
     // Intermediate typedef to workaround MSVC issue.
@@ -152,7 +165,9 @@ struct GenericDimReducer<-1, Self, Op> {
   }
 };
 
-template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
+          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
+                                   !Self::ReducerTraits::IsExactlyAssociative)>
 struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
     typename Self::CoeffReturnType accum = reducer.initialize();
@@ -164,23 +179,100 @@ struct InnerMostDimReducer {
 };
 
 template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, true> {
+struct InnerMostDimReducer<Self, Op, true, false> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
-    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
     const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
-    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
+    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
     for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
+      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
     }
     typename Self::CoeffReturnType accum = reducer.initialize();
     for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
       reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
     }
-    return reducer.finalizeBoth(accum, p);
+    return reducer.finalizeBoth(accum, paccum);
   }
 };
 
-template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+#if !defined(EIGEN_HIPCC) 
+static const int kLeafSize = 1024;
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, false, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
+  reduce(const Self& self, typename Self::Index firstIndex,
+         typename Self::Index numValuesToReduce, Op& reducer) {
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    if (numValuesToReduce > kLeafSize) {
+      const typename Self::Index half = numValuesToReduce / 2;
+      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
+      reducer.reduce(
+          reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
+          &accum);
+    } else {
+      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
+        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+      }
+    }
+    return reducer.finalize(accum);
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
+  reduce(const Self& self, typename Self::Index firstIndex,
+         typename Self::Index numValuesToReduce, Op& reducer) {
+    const typename Self::Index packetSize =
+        internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    if (numValuesToReduce > packetSize * kLeafSize) {
+      // Make sure the split point is aligned on a packet boundary.
+      const typename Self::Index split =
+          packetSize *
+          divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
+                packetSize);
+      const typename Self::Index num_left =
+          numext::mini(split - firstIndex, numValuesToReduce);
+      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
+      if (num_left < numValuesToReduce) {
+        reducer.reduce(
+            reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
+      }
+      return reducer.finalize(accum);
+    } else {
+      const typename Self::Index UnrollSize =
+          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
+      const typename Self::Index VectorizedSize =
+          (numValuesToReduce / packetSize) * packetSize;
+      typename Self::PacketReturnType paccum =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum2 =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+            &paccum2);
+      }
+      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
+        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+                                 firstIndex + j), &paccum);
+      }
+      reducer.reducePacket(paccum2, &paccum);
+      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
+           ++j) {
+        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+      }
+      return reducer.finalizeBoth(accum, paccum);
+    }
+  }
+};
+#endif
+ 
+template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct InnerMostDimPreserver {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
     eigen_assert(false && "should never be called");
@@ -215,11 +307,11 @@ struct InnerMostDimPreserver<-1, Self, Op, true> {
 };
 
 // Default full reducer
-template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct FullReducer {
   static const bool HasOptimizedImplementation = false;
 
-  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
+  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
     const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
     *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
   }
@@ -229,7 +321,7 @@ struct FullReducer {
 #ifdef EIGEN_USE_THREADS
 // Multithreaded full reducers
 template <typename Self, typename Op,
-          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct FullReducerShard {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                   typename Self::Index numValuesToReduce, Op& reducer,
@@ -242,8 +334,8 @@ struct FullReducerShard {
 // Multithreaded full reducer
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
-  static const bool HasOptimizedImplementation = !Op::IsStateful;
-  static const int PacketSize =
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
+  static const Index PacketSize =
       unpacket_traits<typename Self::PacketReturnType>::size;
 
   // launch one reducer per thread and accumulate the result.
@@ -320,29 +412,58 @@ struct OuterReducer {
   }
 };
 
+#ifdef EIGEN_USE_SYCL
+// Default Generic reducer
+template <typename Self, typename Op, typename Device>
+struct GenericReducer {
+  static const bool HasOptimizedImplementation = false;
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+#endif
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
 
 
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename S, typename R, typename I>
-__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
-template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
-template <int NPT, typename S, typename R, typename I>
-__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+#if defined(EIGEN_HAS_GPU_FP16)
+template <typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
 
 #endif
 
-template <int NPT, typename S, typename R, typename I>
-__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 
-template <int NPT, typename S, typename R, typename I>
-__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif
 
+/**
+ * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
+ * This allows the reduction to have a different type for the accumulator than the input data type.
+ * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input
+ * with the accumulator and the other for reducing two accumulators.
+ * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
+ * some properties of the input.
+ */
+template <typename Op, typename CoeffReturnType>
+struct ReductionReturnType {
+#if defined(EIGEN_USE_SYCL)
+  typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
+#else
+  typedef typename remove_const<CoeffReturnType>::type type;
+#endif
+};
+
 }  // end namespace internal
 
 
@@ -376,11 +497,15 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
     const Op m_reducer;
 };
 
+template<typename ArgType, typename Device>
+struct TensorReductionEvaluatorBase;
 
 // Eval as rvalue
 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
 {
+  typedef internal::reducer_traits<Op, Device> ReducerTraits;
+  typedef Dims ReducedDims;
   typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
   typedef typename XprType::Index Index;
   typedef ArgType ChildType;
@@ -390,26 +515,42 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   static const int NumOutputDims = NumInputDims - NumReducedDims;
   typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
   static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+    // Subset of strides of the input tensor for the non-reduced dimensions.
+  // Indexed by output dimensions.
+  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
 
   enum {
     IsAligned = false,
-    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
+    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
   static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool RunningFullReduction = (NumOutputDims==0);
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
+  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
   {
     EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -434,11 +575,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         m_outputStrides[0] = 1;
         for (int i = 1; i < NumOutputDims; ++i) {
           m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
         }
       } else {
-        m_outputStrides.back() = 1;
+        m_outputStrides[NumOutputDims - 1] = 1;
         for (int i = NumOutputDims - 2; i >= 0; --i) {
           m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
         }
       }
     }
@@ -466,6 +609,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           ++reduceIndex;
         } else {
           m_preservedStrides[outputIndex] = input_strides[i];
+          m_output_to_input_dim_map[outputIndex] = i;
           ++outputIndex;
         }
       }
@@ -475,13 +619,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
     if (NumOutputDims == 0) {
       m_preservedStrides[0] = internal::array_prod(input_dims);
     }
+
+    m_numValuesToReduce =
+        NumOutputDims == 0
+            ? internal::array_prod(input_dims)
+            : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+                  ? m_preservedStrides[0]
+                  : m_preservedStrides[NumOutputDims - 1];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-
+  EIGEN_STRONG_INLINE
+  bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
     // Use the FullReducer if possible.
     if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
         internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
@@ -489,7 +639,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
          !RunningOnGPU))) {
       bool need_assign = false;
       if (!data) {
-        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
+        m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
         data = m_result;
         need_assign = true;
       }
@@ -497,20 +647,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
       return need_assign;
     }
-    else if(RunningOnSycl){
-      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
-      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-      if (!data) {
-        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-        m_result = data;
-      }
-      Op reducer(m_reducer);
-      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-      return (m_result != NULL);
-    }
 
     // Attempt to use an optimized reduction.
-    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
+    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
       bool reducing_inner_dims = true;
       for (int i = 0; i < NumReducedDims; ++i) {
         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -524,8 +663,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
-          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
-            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
             m_result = data;
           }
           else {
@@ -533,9 +672,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           }
         }
         Op reducer(m_reducer);
+        // For SYCL this if always return false
         if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
           if (m_result) {
-            m_device.deallocate(m_result);
+            m_device.deallocate_temp(m_result);
             m_result = NULL;
           }
           return true;
@@ -557,8 +697,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
-          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
-            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
             m_result = data;
           }
           else {
@@ -566,9 +706,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           }
         }
         Op reducer(m_reducer);
+        // For SYCL this if always return false
         if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
           if (m_result) {
-            m_device.deallocate(m_result);
+            m_device.deallocate_temp(m_result);
             m_result = NULL;
           }
           return true;
@@ -576,21 +717,54 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           return (m_result != NULL);
         }
       }
+      #if defined(EIGEN_USE_SYCL)
+      // If there is no Optimised version for SYCL, the reduction expression 
+      // must break into two subexpression and use the SYCL generic Reducer on the device.
+      if(RunningOnSycl) {
+         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+         if (!data) {
+           data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+           m_result = data;
+         }
+         Op reducer(m_reducer);
+         internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+         return (m_result != NULL);
+       }
+      #endif
     }
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE
+      void
+      evalSubExprsIfNeededAsync(EvaluatorPointerType data,
+                                EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
+      done(evalSubExprsIfNeededCommon(data));
+    });
+  }
+#endif
+
+  EIGEN_STRONG_INLINE
+  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return evalSubExprsIfNeededCommon(data);
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
     if (m_result) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
+    if (( RunningFullReduction || RunningOnGPU) && m_result ) {
       return *(m_result + index);
     }
     Op reducer(m_reducer);
@@ -662,37 +836,52 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
     }
   }
 
-  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-  /// added for sycl in order to construct the buffer from the sycl device
-  const Device& device() const{return m_device;}
-  /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel
-  const Dims& xprDims() const {return m_xpr_dims;}
-
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_result.bind(cgh);
+  }
+#endif
 
   private:
   template <int, typename, typename> friend struct internal::GenericDimReducer;
-  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
+  template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
   template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
   template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
 #ifdef EIGEN_USE_THREADS
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
-  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
-  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+#if defined(EIGEN_HAS_GPU_FP16)
+  template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
+  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
 #endif
-  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 
-  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif
 
+#if defined(EIGEN_USE_SYCL)
+ template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
+ // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
+ template <typename, typename, typename> friend struct internal::GenericReducer;
+#endif
+
+
   template <typename S, typename O, typename D> friend struct internal::InnerReducer;
 
+  struct BlockIteratorState {
+    Index input_dim;
+    Index output_size;
+    Index output_count;
+  };
+
   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -741,10 +930,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   Dimensions m_dimensions;
   // Precomputed strides for the output tensor.
   array<Index, NumOutputDims> m_outputStrides;
-  // Subset of strides of the input tensor for the non-reduced dimensions.
-  // Indexed by output dimensions.
-  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
+  array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
   array<Index, NumPreservedStrides> m_preservedStrides;
+  // Map from output to input dimension index.
+  array<Index, NumOutputDims> m_output_to_input_dim_map;
+  // How many values go into each reduction
+  Index m_numValuesToReduce;
 
   // Subset of strides of the input tensor for the reduced dimensions.
   // Indexed by reduced dimensions.
@@ -760,7 +951,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   Op m_reducer;
 
   // For full reductions
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
   static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
   static const bool RunningOnSycl = false;
 #elif defined(EIGEN_USE_SYCL)
@@ -770,10 +961,36 @@ static const bool RunningOnGPU = false;
   static const bool RunningOnGPU = false;
   static const bool RunningOnSycl = false;
 #endif
-  typename MakePointer_<CoeffReturnType>::Type m_result;
+  EvaluatorPointerType m_result;
 
-  const Device& m_device;
-  const Dims& m_xpr_dims;
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
+};
+
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
+
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
+  // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
+  //Therefore the coeff function should be overridden by for SYCL kernel
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+    return *(this->data() + index);
+  }
+  // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
+  //Therefore the packet function should be overridden by for SYCL kernel
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+  }
 };
 
 } // end namespace Eigen
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 65638b6a8408d059f8a1d3ebcbef6e9bebb1a6c2..68780cd3cf43c60a672e3398a15b38c1ea9e4ad2 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -1,750 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
-
-namespace Eigen {
-namespace internal {
-
-
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-// Full reducers for GPU, don't vectorize for now
-
-// Reducer function that enables multiple cuda thread to safely accumulate at the same
-// output address. It basically reads the current value of the output variable, and
-// attempts to update it with the new value. If in the meantime another cuda thread
-// updated the content of the output address it will try again.
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if __CUDA_ARCH__ >= 300
-  if (sizeof(T) == 4)
-  {
-    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-    unsigned int newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned int readback;
-    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
-  }
-  else if (sizeof(T) == 8) {
-    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
-    unsigned long long newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned long long readback;
-    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
-  }
-  else {
-    assert(0 && "Wordsize not supported");
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-// We extend atomicExch to support extra data types
-template <typename Type>
-__device__ inline Type atomicExchCustom(Type* address, Type val) {
-  return atomicExch(address, val);
-}
-
-template <>
-__device__ inline double atomicExchCustom(double* address, double val) {
-  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
-  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
-}
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <template <typename T> class R>
-__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-  unsigned int newval = oldval;
-  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-  if (newval == oldval) {
-    return;
-  }
-  unsigned int readback;
-  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-    oldval = readback;
-    newval = oldval;
-    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-  }
-}
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file"
 #endif
 
-template <>
-__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if __CUDA_ARCH__ >= 300
-  atomicAdd(output, accum);
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-
-template <typename CoeffType, typename Index>
-__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index num_threads = blockDim.x * gridDim.x;
-  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-    output[i] = val;
-  }
-}
-
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
-#if __CUDA_ARCH__ >= 300
-  // Initialize the output value
-  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-  if (gridDim.x == 1) {
-    if (first_index == 0) {
-      *output = reducer.initialize();
-    }
-  }
-  else {
-    if (threadIdx.x == 0) {
-      unsigned int block = atomicCAS(semaphore, 0u, 1u);
-      if (block == 0) {
-        // We're the first block to run, initialize the output value
-        atomicExchCustom(output, reducer.initialize());
-        __threadfence();
-        atomicExch(semaphore, 2u);
-      }
-      else {
-        // Wait for the first block to initialize the output value.
-        // Use atomicCAS here to ensure that the reads aren't cached
-        unsigned int val;
-        do {
-          val = atomicCAS(semaphore, 2u, 2u);
-        }
-        while (val < 2u);
-      }
-    }
-  }
-
-  __syncthreads();
-
-  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
-
-  typename Self::CoeffReturnType accum = reducer.initialize();
-  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
-  for (Index i = 0; i < max_iter; i+=BlockSize) {
-    const Index index = first_index + i;
-    eigen_assert(index < num_coeffs);
-    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
-    reducer.reduce(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2) {
-    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
-  }
-
-  if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(output, accum, reducer);
-  }
-
-  if (gridDim.x > 1 && threadIdx.x == 0) {
-    // Let the last block reset the semaphore
-    atomicInc(semaphore, gridDim.x + 1);
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self,
-          typename Reducer, typename Index>
-__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
-  eigen_assert(blockDim.x == 1);
-  eigen_assert(gridDim.x == 1);
-  if (num_coeffs % 2 != 0) {
-    half last = input.m_impl.coeff(num_coeffs-1);
-    *scratch = __halves2half2(last, reducer.initialize());
-  } else {
-    *scratch = reducer.template initializePacket<half2>();
-  }
-}
-
-template <typename Self,
-          typename Reducer, typename Index>
-__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index num_packets = num_coeffs / 2;
-  for (Index i = thread_id; i < num_packets; i += num_threads) {
-    ((half2*)output)[i] = reducer.template initializePacket<half2>();
-  }
-
-  if (thread_id == 0 && num_coeffs % 2 != 0) {
-    output[num_coeffs-1] = reducer.initialize();
-  }
-}
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
-                                    half* output, half2* scratch) {
-  eigen_assert(NumPerThread % 2 == 0);
-
-  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    if (num_coeffs % 2 != 0) {
-      half last = input.m_impl.coeff(num_coeffs-1);
-      *scratch = __halves2half2(last, reducer.initialize());
-    } else {
-      *scratch = reducer.template initializePacket<half2>();
-    }
-    __syncthreads();
-  }
-
-  half2 accum = reducer.template initializePacket<half2>();
-  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
-  for (Index i = 0; i < max_iter; i += BlockSize) {
-    const Index index = first_index + 2*i;
-    eigen_assert(index + 1 < num_coeffs);
-    half2 val = input.m_impl.template packet<Unaligned>(index);
-    reducer.reducePacket(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2) {
-    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
-  }
-
-  if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(scratch, accum, reducer);
-  }
-
-  __syncthreads();
-
-  if (gridDim.x == 1 && first_index == 0) {
-    half tmp = __low2half(*scratch);
-    reducer.reduce(__high2half(*scratch), &tmp);
-    *output = tmp;
-  }
-}
-
-template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
-  eigen_assert(threadIdx.x == 1);
-  half tmp = __low2half(*scratch);
-  reducer.reduce(__high2half(*scratch), &tmp);
-  *output = tmp;
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct FullReductionLauncher {
-  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    assert(false && "Should only be called on doubles, floats and half floats");
-  }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct FullReductionLauncher<
-    Self, Op, OutputType, PacketAccess,
-    typename internal::enable_if<
-      internal::is_same<float, OutputType>::value ||
-      internal::is_same<double, OutputType>::value,
-    void>::type> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
-    typedef typename Self::Index Index;
-    typedef typename Self::CoeffReturnType Scalar;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-
-    unsigned int* semaphore = NULL;
-    if (num_blocks > 1) {
-      semaphore = device.semaphore();
-    }
-
-    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
-  }
-};
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, false> {
-  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
-    assert(false && "Should not be called since there is no packet accessor");
-  }
-};
-
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, true> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
-    typedef typename Self::Index Index;
-
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    half2* scratch = static_cast<half2*>(device.scratchpad());
-
-    if (num_blocks > 1) {
-      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
-                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
-    }
-
-    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
-
-    if (num_blocks > 1) {
-      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
-                         1, 1, 0, device, reducer, output, scratch);
-    }
-  }
-};
-#endif
-
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple cases
-  // of doubles, floats and half floats
-#ifdef EIGEN_HAS_CUDA_FP16
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
-
-  template <typename OutputType>
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return;
-    }
-
-    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
-  }
-};
-
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                         typename Self::CoeffReturnType* output) {
-#if __CUDA_ARCH__ >= 300
-  typedef typename Self::CoeffReturnType Type;
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.y == 1);
-  eigen_assert(gridDim.z == 1);
-
-  const int unroll_times = 16;
-  eigen_assert(NumPerThread % unroll_times == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
-  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
-
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
-    const Index row = i / input_col_blocks;
-
-    if (row < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
-
-      Type reduced_val = reducer.initialize();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
-        if (last_col >= num_coeffs_to_reduce) {
-          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
-            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            reducer.reduce(val, &reduced_val);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + blockDim.x * (j + k);
-            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
-          }
-        }
-      }
-
-#pragma unroll
-      for (int offset = warpSize/2; offset > 0; offset /= 2) {
-        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
-      }
-
-      if ((threadIdx.x & (warpSize - 1)) == 0) {
-        atomicReduce(&(output[row]), reduced_val, reducer);
-      }
-    }
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-#ifdef EIGEN_HAS_CUDA_FP16
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                              half* output) {
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.y == 1);
-  eigen_assert(gridDim.z == 1);
-
-  const int unroll_times = 16;
-  eigen_assert(NumPerThread % unroll_times == 0);
-  eigen_assert(unroll_times % 2 == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
-  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
-
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    Index i = 2*thread_id;
-    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
-      half* loc = output + i;
-      *((half2*)loc) = reducer.template initializePacket<half2>();
-    }
-    if (i < num_preserved_coeffs) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
-    const Index row = 2 * (i / input_col_blocks);
-
-    if (row + 1 < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
-
-      half2 reduced_val1 = reducer.template initializePacket<half2>();
-      half2 reduced_val2 = reducer.template initializePacket<half2>();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
-        if (last_col >= num_coeffs_to_reduce) {
-          Index col = col_begin + blockDim.x * j;
-          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
-            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
-            reducer.reducePacket(val1, &reduced_val1);
-            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
-            reducer.reducePacket(val2, &reduced_val2);
-          }
-          if (col < num_coeffs_to_reduce) {
-            // Peel;
-            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            const half2 val1 = __halves2half2(last1, reducer.initialize());
-            reducer.reducePacket(val1, &reduced_val1);
-            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
-            const half2 val2 = __halves2half2(last2, reducer.initialize());
-            reducer.reducePacket(val2, &reduced_val2);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + blockDim.x * (j + k) * 2;
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
-          }
-        }
-      }
-
-#pragma unroll
-      for (int offset = warpSize/2; offset > 0; offset /= 2) {
-        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
-        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
-      }
-
-      half val1 =  __low2half(reduced_val1);
-      reducer.reduce(__high2half(reduced_val1), &val1);
-      half val2 =  __low2half(reduced_val2);
-      reducer.reduce(__high2half(reduced_val2), &val2);
-      half2 val = __halves2half2(val1, val2);
-
-      if ((threadIdx.x & (warpSize - 1)) == 0) {
-        half* loc = output + row;
-        atomicReduce((half2*)loc, val, reducer);
-      }
-    }
-  }
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct InnerReductionLauncher {
-  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
-    return true;
-  }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct InnerReductionLauncher<
-  Self, Op, OutputType, PacketAccess,
-  typename internal::enable_if<
-    internal::is_same<float, OutputType>::value ||
-    internal::is_same<double, OutputType>::value,
-  void>::type> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
-                         num_preserved_vals, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
-  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should not be called since there is no packet accessor");
-    return true;
-  }
-};
-
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    if (num_preserved_vals % 2 != 0) {
-      // Not supported yet, revert to the slower code path
-      return true;
-    }
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = /*256*/128;
-    const int num_per_thread = /*128*/64;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
-                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-#endif
-
-
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats and half floats.
-#ifdef EIGEN_HAS_CUDA_FP16
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
-
-  template <typename OutputType>
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return true;
-    }
-    // It's faster to use the usual code.
-    if (num_coeffs_to_reduce <= 128) {
-      return true;
-    }
-
-    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
-  }
-};
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                     typename Self::CoeffReturnType* output) {
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
-  for (Index i = thread_id; i < max_iter; i += num_threads) {
-    const Index input_col = i % num_preserved_coeffs;
-    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
-    typename Self::CoeffReturnType reduced_val = reducer.initialize();
-    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
-    for (Index j = input_row; j < max_row; j++) {
-      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
-      reducer.reduce(val, &reduced_val);
-    }
-    atomicReduce(&(output[input_col]), reduced_val, reducer);
-  }
-}
-
-
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, GpuDevice> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats.
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
-  template <typename Device, typename OutputType>
-  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
-    return true;
-  }
-
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    // It's faster to use the usual code.
-    if (num_coeffs_to_reduce <= 32) {
-      return true;
-    }
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 16;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs in the reduction kernel itself when we don't have to worry
-      // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                             device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
-                         num_preserved_vals, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-
-#endif
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#include "TensorReductionGpu.h"
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..db4e8d866f04ce0586835da2595a171856ae96dd
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -0,0 +1,966 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple gpu thread to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another gpu thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  if (sizeof(T) == 4)
+  {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else {
+    gpu_assert(0 && "Wordsize not supported");
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
+  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+  unsigned int newval = oldval;
+  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+  if (newval == oldval) {
+    return;
+  }
+  unsigned int readback;
+  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+    oldval = readback;
+    newval = oldval;
+    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+  }
+}
+// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
+template <typename R>
+__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
+  half2* houtput=reinterpret_cast<half2*>(output);
+  half2* haccum=reinterpret_cast<half2*>(&accum);
+  for(int i=0;i<4;++i){
+    atomicReduce(houtput+i,*(haccum+i),reducer);
+  }
+}
+#endif  // EIGEN_HAS_GPU_FP16
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  atomicAdd(output, accum);
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+    output[i] = val;
+  }
+}
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  // Initialize the output value
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
+  }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      }
+      else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+        while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+  for (Index i = 0; i < max_iter; i+=BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_HIPCC)
+    // use std::is_floating_point to determine the type of reduced_val 
+    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
+    // and list the float and int versions of __shfl_down as the candidate functions. 
+    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
+      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
+    } else {
+      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
+    }
+  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+  #else
+    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+  #endif
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(output, accum, reducer);
+  }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
+#if defined(EIGEN_HIPCC)
+    __threadfence_system();
+#endif
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+                                                      packet_traits<Eigen::half>::type* scratch) {
+  eigen_assert(blockDim.x == 1);
+  eigen_assert(gridDim.x == 1);
+  typedef packet_traits<Eigen::half>::type packet_type;
+  Index packet_remainder =
+      num_coeffs % Index(unpacket_traits<packet_type>::size);
+  if (packet_remainder != 0) {
+    half2* h2scratch = reinterpret_cast<half2*>(scratch);
+    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
+      *h2scratch =
+          __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
+      h2scratch++;
+    }
+    if ((num_coeffs & 1) != 0) {
+      half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
+      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
+    }
+  } else {
+    *scratch = reducer.template initializePacket<packet_type>();
+  }
+}
+
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+
+  const Index num_packets =
+      num_coeffs / Index(unpacket_traits<PacketType>::size);
+  PacketType* p_output = reinterpret_cast<PacketType*>(output);
+  for (Index i = thread_id; i < num_packets; i += num_threads) {
+    p_output[i] = reducer.template initializePacket<PacketType>();
+  }
+  Index packet_remainder =
+      num_coeffs % Index(unpacket_traits<PacketType>::size);
+  if (thread_id < packet_remainder) {
+    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+                                    half* output, packet_traits<Eigen::half>::type* scratch) {
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  eigen_assert(NumPerThread % packet_width == 0);
+  const Index first_index =
+      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
+
+  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      int rem = num_coeffs % packet_width;
+      if (rem != 0) {
+        half2* p_scratch = reinterpret_cast<half2*>(scratch);
+        *scratch = reducer.template initializePacket<PacketType>();
+        for (int i = 0; i < rem / 2; i++) {
+          *p_scratch = __halves2half2(
+              input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
+              input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
+          p_scratch++;
+        }
+        if ((num_coeffs & 1) != 0) {
+          half last = input.m_impl.coeff(num_coeffs - 1);
+          *p_scratch = __halves2half2(last, reducer.initialize());
+        }
+      } else {
+        *scratch = reducer.template initializePacket<PacketType>();
+      }
+    }
+    __syncthreads();
+  }
+
+  PacketType accum = reducer.template initializePacket<PacketType>();
+  const Index max_iter =
+      numext::mini<Index>((num_coeffs - first_index) / packet_width,
+                          NumPerThread * BlockSize / packet_width);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + packet_width * i;
+    eigen_assert(index + packet_width < num_coeffs);
+    PacketType val = input.m_impl.template packet<Unaligned>(index);
+    reducer.reducePacket(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_HIPCC)
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+      union { int i; half2 h; } wka_in, wka_out;
+      wka_in.h = hacc[i];
+      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+      hr[i] = wka_out.h;
+    }
+    reducer.reducePacket(r1, &accum);
+  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down(hacc[i], offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+  #else
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+
+  #endif
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(scratch, accum, reducer);
+  }
+
+  __syncthreads();
+  half2* rv1 = reinterpret_cast<half2*>(scratch);
+  if (packet_width > 2) {
+    reducer.reducePacket(rv1[2], rv1);
+    reducer.reducePacket(rv1[3], rv1 + 1);
+    reducer.reducePacket(rv1[1], rv1);
+  }
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      half tmp = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &tmp);
+      *output = tmp;
+    }
+  }
+}
+
+template <typename Op>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+  eigen_assert(threadIdx.x == 1);
+  half2* pscratch = reinterpret_cast<half2*>(scratch);
+  half tmp = __float2half(0.f);
+  typedef packet_traits<Eigen::half>::type packet_type;
+  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
+    reducer.reduce(__low2half(*pscratch), &tmp);
+    reducer.reduce(__high2half(*pscratch), &tmp);
+    pscratch++;
+  }
+  *output = tmp;
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+    gpu_assert(false && "Should only be called on doubles, floats and half floats");
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+
+    typedef typename Self::Index Index;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+    typedef typename packet_traits<Eigen::half>::type PacketType;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
+    // half2* scratch = static_cast<half2*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+                         1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
+#ifdef EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+  }
+};
+
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                         typename Self::CoeffReturnType* output) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  typedef typename Self::CoeffReturnType Type;
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      Type reduced_val = reducer.initialize();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_HIPCC)
+        // use std::is_floating_point to determine the type of reduced_val 
+       // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
+       // and list the float and int versions of __shfl_down as the candidate functions. 
+        if (std::is_floating_point<Type>::value) {
+          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
+        } else {
+          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
+        }
+      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+      #else
+        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+      #endif
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        atomicReduce(&(output[row]), reduced_val, reducer);
+      }
+    }
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  const int unroll_times = 16 / packet_width;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = packet_width * thread_id;
+    for (; i + packet_width <= num_preserved_coeffs;
+         i += packet_width * num_threads) {
+      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
+      *poutput = reducer.template initializePacket<PacketType>();
+    }
+    if (i < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin =
+          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
+      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col =
+            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + packet_width <= num_coeffs_to_reduce;
+               col += blockDim.x) {
+            const PacketType val1 = input.m_impl.template packet<Unaligned>(
+                row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val1, &reduced_val1);
+            const PacketType val2 = input.m_impl.template packet<Unaligned>(
+                (row + 1) * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          if (col < num_coeffs_to_reduce) {
+            PacketType r1 = reducer.template initializePacket<PacketType>();
+            PacketType r2 = reducer.template initializePacket<PacketType>();
+            half2* hr1 = reinterpret_cast<half2*>(&r1);
+            half2* hr2 = reinterpret_cast<half2*>(&r2);
+            while (col + 1 < num_coeffs_to_reduce) {
+              *hr1 = __halves2half2(
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
+              *hr2 = __halves2half2(
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
+                                     1));
+              hr1++;
+              hr2++;
+              col += 2;
+            }
+            if (col < num_coeffs_to_reduce) {
+              // Peel;
+              const half last1 =
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+              *hr1 = __halves2half2(last1, reducer.initialize());
+              const half last2 =
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
+              *hr2 = __halves2half2(last2, reducer.initialize());
+            }
+            reducer.reducePacket(r1, &reduced_val1);
+            reducer.reducePacket(r2, &reduced_val2);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
+                                     row * num_coeffs_to_reduce + col),
+                                 &reduced_val1);
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
+                                     (row + 1) * num_coeffs_to_reduce + col),
+                                 &reduced_val2);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_HIPCC)
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+	  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+	  union { int i; half2 h; } wka_in1, wka_out1;
+	  wka_in1.h = rv1[i];
+	  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
+	  hr1[i] = wka_out1.h;
+
+	  union { int i; half2 h; } wka_in2, wka_out2;
+	  wka_in2.h = rv2[i];
+	  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
+	  hr2[i] = wka_out2.h;
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
+          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+      #else
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          hr1[i] =
+              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
+          hr2[i] =
+              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+
+      #endif
+      }
+      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+      half2 val;
+      if (packet_width > 2) {
+        reducer.reducePacket(rv1[2], rv1);
+        reducer.reducePacket(rv1[3], rv1 + 1);
+        reducer.reducePacket(rv1[1], rv1);
+        reducer.reducePacket(rv2[2], rv2);
+        reducer.reducePacket(rv2[3], rv2 + 1);
+        reducer.reducePacket(rv2[1], rv2);
+      }
+      half val1 = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &val1);
+      half val2 = __low2half(*rv2);
+      reducer.reduce(__high2half(*rv2), &val2);
+      val = __halves2half2(val1, val2);
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        half* loc = output + row;
+        atomicReduce((half2*)loc, val, reducer);
+      }
+    }
+  }
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+    return true;
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+  Self, Op, OutputType, PacketAccess,
+  typename internal::enable_if<
+    internal::is_same<float, OutputType>::value ||
+    internal::is_same<double, OutputType>::value,
+  void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+    return true;
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    if (num_preserved_vals % 2 != 0) {
+      // Not supported yet, revert to the slower code path
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = /*256*/128;
+    const int num_per_thread = /*128*/64;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return true;
+    }
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 128) {
+      return true;
+    }
+
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+  }
+};
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                     typename Self::CoeffReturnType* output) {
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  // Do the reduction.
+  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+  for (Index i = thread_id; i < max_iter; i += num_threads) {
+    const Index input_col = i % num_preserved_coeffs;
+    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+    typename Self::CoeffReturnType reduced_val = reducer.initialize();
+    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+    for (Index j = input_row; j < max_row; j++) {
+      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+      reducer.reduce(val, &reduced_val);
+    }
+    atomicReduce(&(output[input_col]), reduced_val, reducer);
+  }
+}
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats.
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+  template <typename Device, typename OutputType>
+  static
+    #if !defined(EIGEN_HIPCC)
+    // FIXME :  leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
+    //          (in the cxx11_tensor_reduction_gpu test)
+    //
+    // terminate called after throwing an instance of 'std::runtime_error'
+    //   what():  No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
+    //
+    // don't know why this happens (and why is it a runtime error instead of a compile time error)
+    //
+    // this will be fixed by HIP PR#457
+    EIGEN_DEVICE_FUNC
+    #endif
+    bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+    return true;
+  }
+
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 32) {
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 16;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs in the reduction kernel itself when we don't have to worry
+      // about race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumGpuMultiProcessors() *
+                             device.maxGpuThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 3daecb045408832ee11e08d9af345ac363e70942..474eba06f2c04c0325b8b831d0f686c2952eb3ba 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -11,232 +11,572 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*****************************************************************
- * TensorSyclPlaceHolderExpr.h
+ * TensorReductionSycl.h
  *
  * \brief:
- *  This is the specialisation of the placeholder expression based on the
- * operation type
+ *  This is the specialization of the reduction operation. Two phase reduction approach 
+ * is used since the GPU does not have Global Synchronization for global memory among 
+ * different work-group/thread block. To solve the problem, we need to create two kernels 
+ * to reduce the data, where the first kernel reduce the data locally and each local 
+ * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
+ * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. 
+ * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
+ * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
  *
-*****************************************************************/
+ *****************************************************************/
 
 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
-
 namespace Eigen {
+namespace TensorSycl {
 namespace internal {
 
-template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
-template<typename BufferTOut, typename BufferTIn>
-static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
-  do {
-          auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
-            cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
-                                    cl::sycl::range<1>{std::min(length, local)}};
-            /* Two accessors are used: one to the buffer that is being reduced,
-             * and a second to local memory, used to store intermediate data. */
-            auto aI =
-                bufI.template get_access<cl::sycl::access::mode::read_write>(h);
-            auto aOut =
-                bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
-            cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
-                               cl::sycl::access::target::local>
-                scratch(cl::sycl::range<1>(local), h);
-
-            /* The parallel_for invocation chosen is the variant with an nd_item
-             * parameter, since the code requires barriers for correctness. */
-            h.parallel_for<KernelName>(
-                r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
-                  size_t globalid = id.get_global(0);
-                  size_t localid = id.get_local(0);
-                  /* All threads collectively read from global memory into local.
-                   * The barrier ensures all threads' IO is resolved before
-                   * execution continues (strictly speaking, all threads within
-                   * a single work-group - there is no co-ordination between
-                   * work-groups, only work-items). */
-                  if (globalid < length) {
-                    scratch[localid] = aI[globalid];
-                  }
-                  id.barrier(cl::sycl::access::fence_space::local_space);
-
-                  /* Apply the reduction operation between the current local
-                   * id and the one on the other half of the vector. */
-                  if (globalid < length) {
-                    int min = (length < local) ? length : local;
-                    for (size_t offset = min / 2; offset > 0; offset /= 2) {
-                      if (localid < offset) {
-                        scratch[localid] += scratch[localid + offset];
-                      }
-                      id.barrier(cl::sycl::access::fence_space::local_space);
-                    }
-                    /* The final result will be stored in local id 0. */
-                    if (localid == 0) {
-                      aI[id.get_group(0)] = scratch[localid];
-                      if((length<=local) && globalid ==0){
-                        aOut[globalid]=scratch[localid];
-                      }
-                    }
-                  }
-                });
-          };
-            dev.m_queue.submit(f);
-            dev.m_queue.throw_asynchronous();
-
-          /* At this point, you could queue::wait_and_throw() to ensure that
-           * errors are caught quickly. However, this would likely impact
-           * performance negatively. */
-          length = length / local;
-
-        } while (length > 1);
-
-
-
-}
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+  typedef Op type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
 
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &) {
+    return accumulator;
+  }
 };
 
-/// For now let's start with a full reducer
-/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
-/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
-/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
-// a leafNode.
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+                                                                           const Index &scale) {
+    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+    return quotient_op(accumulator, CoeffReturnType(scale));
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &scale) {
+    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+  }
+};
+
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+          Index local_range>
+struct SecondStepFullReducer {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+  typedef typename OpDef::type Op;
+  LocalAccessor scratch;
+  InputAccessor aI;
+  OutputAccessor outAcc;
+  Op op;
+  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) {
+    // Our empirical research shows that the best performance will be achieved
+    // when there is only one element per thread to reduce in the second step.
+    // in this step the second step reduction time is almost negligible.
+    // Hence, in the second step of reduction the input size is fixed to the
+    // local size, thus, there is only one element read per thread. The
+    // algorithm must be changed if the number of reduce per thread in the
+    // second step is greater than 1. Otherwise, the result will be wrong.
+    const Index localid = itemID.get_local_id(0);
+    auto aInPtr = aI.get_pointer() + localid;
+    auto aOutPtr = outAcc.get_pointer();
+    CoeffReturnType *scratchptr = scratch.get_pointer();
+    CoeffReturnType accumulator = *aInPtr;
+
+    scratchptr[localid] = op.finalize(accumulator);
+    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.reduce(scratchptr[localid + offset], &accumulator);
+        scratchptr[localid] = op.finalize(accumulator);
+      }
+    }
+    if (localid == 0) *aOutPtr = op.finalize(accumulator);
+  }
+};
+
+// Full reduction first phase. In this version the vectorization is true and the reduction accept 
+// any generic reducerOp  e.g( max, min, sum, mean, iamax, iamin, etc ). 
+template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
+class FullReductionKernelFunctor {
+ public:
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
+                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+      OpDef;
+
+  typedef typename OpDef::type Op;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::PacketReturnType PacketReturnType;
+  typedef
+      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
+                                              PacketReturnType, CoeffReturnType>::type OutType;
+  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  LocalAccessor scratch;
+  Evaluator evaluator;
+  EvaluatorPointerType final_output;
+  Index rng;
+  Op op;
+
+  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
+                             Index rng_, OpType op_)
+      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
+
+  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
+      const cl::sycl::nd_item<1> &itemID) {
+    auto output_ptr = final_output.get_pointer();
+    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
+    Index globalid = itemID.get_global_id(0);
+    Index localid = itemID.get_local_id(0);
+    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    Index start = Evaluator::PacketSize * globalid;
+    // vectorizable parts
+    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
+    for (Index i = start; i < VectorizedRange; i += step) {
+      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
+    }
+    globalid += VectorizedRange;
+    // non vectorizable parts
+    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      op.template reducePacket<PacketReturnType>(
+          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
+              evaluator.impl().coeff(i), op.initialize()),
+          &packetAccumulator);
+    }
+    scratch[localid] = packetAccumulator =
+        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
+    // reduction parts // Local size is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
+        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
+      }
+    }
+    if (localid == 0) {
+      output_ptr[itemID.get_group(0)] =
+          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
+    }
+  }
+
+  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
+      const cl::sycl::nd_item<1> &itemID) {
+    auto output_ptr = final_output.get_pointer();
+    Index globalid = itemID.get_global_id(0);
+    Index localid = itemID.get_local_id(0);
+    // vectorizable parts
+    CoeffReturnType accumulator = op.initialize();
+    // non vectorizable parts
+    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      op.reduce(evaluator.impl().coeff(i), &accumulator);
+    }
+    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
+
+    // reduction parts. the local size is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.reduce(scratch[localid + offset], &accumulator);
+        scratch[localid] = op.finalize(accumulator);
+      }
+    }
+    if (localid == 0) {
+      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
+    }
+  }
+};
+
+template <typename Evaluator, typename OpType>
+class GenericNondeterministicReducer {
+ public:
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  template <typename Scratch>
+  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
+                       Index range_, Index num_values_to_reduce_)
+      : evaluator(evaluator_),
+        output_accessor(output_accessor_),
+        functor(OpDef::get_op(functor_)),
+        range(range_),
+        num_values_to_reduce(num_values_to_reduce_) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) {
+    auto output_accessor_ptr = output_accessor.get_pointer();
+    /// const cast added as a naive solution to solve the qualifier drop error
+    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
+    if (globalid < range) {
+      CoeffReturnType accum = functor.initialize();
+      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
+          evaluator, evaluator.firstInput(globalid), functor, &accum);
+      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
+    }
+  }
+
+ private:
+  Evaluator evaluator;
+  EvaluatorPointerType output_accessor;
+  Op functor;
+  Index range;
+  Index num_values_to_reduce;
+};
+
+enum class reduction_dim { inner_most, outer_most };
+// default is preserver
+template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
+struct PartialReductionKernel {
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      ScratchAcc;
+  ScratchAcc scratch;
+  Evaluator evaluator;
+  EvaluatorPointerType output_accessor;
+  Op op;
+  const Index preserve_elements_num_groups;
+  const Index reduce_elements_num_groups;
+  const Index num_coeffs_to_preserve;
+  const Index num_coeffs_to_reduce;
+
+  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
+                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
+                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
+      : scratch(scratch_),
+        evaluator(evaluator_),
+        output_accessor(output_accessor_),
+        op(OpDef::get_op(op_)),
+        preserve_elements_num_groups(preserve_elements_num_groups_),
+        reduce_elements_num_groups(reduce_elements_num_groups_),
+        num_coeffs_to_preserve(num_coeffs_to_preserve_),
+        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
+                                                                 CoeffReturnType &accumulator) {
+    if (globalPId >= num_coeffs_to_preserve) {
+      return;
+    }
+    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
+                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
+    Index localOffset = globalRId;
+
+    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
+    const Index per_thread_global_stride =
+        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
+    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
+      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
+      localOffset += per_thread_local_stride;
+      global_offset += per_thread_global_stride;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const Index linearLocalThreadId = itemID.get_local_id(0);
+    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
+                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
+    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
+                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
+    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
+                                                           : itemID.get_group(0) / reduce_elements_num_groups;
+    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
+                                                           : itemID.get_group(0) % reduce_elements_num_groups;
+
+    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
+    auto scratchPtr = scratch.get_pointer().get();
+    auto outPtr =
+        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
+    CoeffReturnType accumulator = op.initialize();
+
+    element_wise_reduce(globalRId, globalPId, accumulator);
 
+    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
+    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
+        accumulator;
+    if (rt == reduction_dim::inner_most) {
+      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
+      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
+      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+    }
+
+    /* Apply the reduction operation between the current local
+     * id and the one on the other half of the vector. */
+    auto out_scratch_ptr =
+        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+    if (rt == reduction_dim::inner_most) {
+      accumulator = *out_scratch_ptr;
+    }
+    // The Local LocalThreadSizeR is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
+      if (rLocalThreadId < offset) {
+        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
+        // The result has already been divided for mean reducer in the
+        // previous reduction so no need to divide furthermore
+        *out_scratch_ptr = op.finalize(accumulator);
+      }
+      /* All threads collectively read from global memory into local.
+       * The barrier ensures all threads' IO is resolved before
+       * execution continues (strictly speaking, all threads within
+       * a single work-group - there is no co-ordination between
+       * work-groups, only work-items). */
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+    }
+
+    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
+      outPtr[globalPId] = op.finalize(accumulator);
+    }
+  }
+};
+
+template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
+struct SecondStepPartialReduction {
+  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      ScratchAccessor;
+  InputAccessor input_accessor;
+  OutputAccessor output_accessor;
+  Op op;
+  const Index num_coeffs_to_preserve;
+  const Index num_coeffs_to_reduce;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
+                                                                   OutputAccessor output_accessor_, OpType op_,
+                                                                   const Index num_coeffs_to_preserve_,
+                                                                   const Index num_coeffs_to_reduce_)
+      : input_accessor(input_accessor_),
+        output_accessor(output_accessor_),
+        op(OpDef::get_op(op_)),
+        num_coeffs_to_preserve(num_coeffs_to_preserve_),
+        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const Index globalId = itemID.get_global_id(0);
+
+    if (globalId >= num_coeffs_to_preserve) return;
+
+    auto in_ptr = input_accessor.get_pointer() + globalId;
+
+    OutScalar accumulator = op.initialize();
+// num_coeffs_to_reduce is not bigger that 256
+    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
+      op.reduce(*in_ptr, &accumulator);
+      in_ptr += num_coeffs_to_preserve;
+    }
+    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
+  }
+};  // namespace internal
+
+template <typename Index, Index LTP, Index LTR, bool BC_>
+struct ReductionPannel {
+  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
+  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
+  static EIGEN_CONSTEXPR bool BC = BC_;
+};
+
+template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
+struct PartialReducerLauncher {
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
   typedef typename Self::CoeffReturnType CoeffReturnType;
-  static const bool HasOptimizedImplementation = false;
-
-  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
-    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
-    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-    auto functors = TensorSycl::internal::extractFunctors(self.impl());
-    int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
-    size_t inputSize =self.impl().dimensions().TotalSize();
-    size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
-    size_t remaining = inputSize% red_factor;
-    if(rng ==0) {
-      red_factor=1;
-    };
-    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
-    size_t GRange=std::max((size_t )1, rng);
-
-    // convert global range to power of 2 for redecution
-    GRange--;
-    GRange |= GRange >> 1;
-    GRange |= GRange >> 2;
-    GRange |= GRange >> 4;
-    GRange |= GRange >> 8;
-    GRange |= GRange >> 16;
-#if __x86_64__ || __ppc64__ || _WIN64
-    GRange |= GRange >> 32;
-#endif
-    GRange++;
-    size_t  outTileSize = tileSize;
-    /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
-    if (GRange < outTileSize) outTileSize=GRange;
-    // getting final out buffer at the moment the created buffer is true because there is no need for assign
-    auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
-    /// creating the shared memory for calculating reduction.
-    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
-    /// recursively apply reduction on it in order to reduce the whole.
-    auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
-    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
-    Dims dims= self.xprDims();
-    Op functor = reducer;
-    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
-      // create a tuple of accessors from Evaluator
-      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
-      auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
-
-      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
-        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
-        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
-        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
-        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
-        /// the device_evaluator is detectable and recognisable on the device.
-        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
-        /// const cast added as a naive solution to solve the qualifier drop error
-        auto globalid=itemID.get_global_linear_id();
-
-        if(globalid<rng)
-          tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
-        else
-          tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
-
-        if(remaining!=0 && globalid==0 )
-          // this will add the rest of input buffer when the input size is not devidable to red_factor.
-          tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
-      });
-    });
-  dev.m_queue.throw_asynchronous();
-
-/// This is used to recursively reduce the tmp value to an element of 1;
-  syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
+  typedef typename Self::Storage Storage;
+  typedef typename Self::Index Index;
+  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
+      PannelParameters;
+
+  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
+                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
+    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
+
+    // getPowerOfTwo makes sure local range is power of 2 and <=
+    // maxSyclThreadPerBlock this will help us to avoid extra check on the
+    // kernel
+    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+
+    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+    // In this step, we force the code not to be more than 2-step reduction:
+    // Our empirical research shows that if each thread reduces at least 64
+    // elemnts individually, we get better performance. However, this can change
+    // on different platforms. In this step we force the code not to be
+    // morthan step reduction: Our empirical research shows that for inner_most
+    // dim reducer, it is better to have 8 group in a reduce dimension for sizes
+    // > 1024 to achieve the best performance.
+    const Index reductionPerThread = 64;
+    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+    const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+    EIGEN_CONSTEXPR Index scratchSize =
+        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (rNumGroups > 1) {
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+          num_coeffs_to_reduce);
+
+      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
+          SecondStepPartialReductionKernel;
+
+      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+          temp_accessor, output,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
+          reducer, num_coeffs_to_preserve, rNumGroups);
+
+      self.device().deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+          num_coeffs_to_reduce);
+    }
+    return false;
+  }
+};
+}  // namespace internal
+}  // namespace TensorSycl
+
+namespace internal {
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
+    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+    // In this step we force the code not to be more than 2-step reduction:
+    // Our empirical research shows that if each thread reduces at least 512
+    // elemnts individually, we get better performance.
+    const Index reductionPerThread = 2048;
+    // const Index num_work_group =
+    Index reductionGroup = dev.getPowerOfTwo(
+        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+    const Index num_work_group = std::min(reductionGroup, local_range);
+    // 1
+    // ? local_range
+    // : 1);
+    const Index global_range = num_work_group * local_range;
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+                                                                      local_range, inputSize, reducer);
+
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, Index, local_range>
+          GenericRKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+          tmp_global_accessor, data,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
+          reducer);
+
+      dev.deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+                                                                      reducer);
+    }
+  }
+};
+// vectorizable inner_most most dim preserver
+// col reduction
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
   }
+};
+// row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
 
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
 };
 
+// ArmgMax uses this kernel for partial reduction//
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
 template <typename Self, typename Op>
-struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    typename Self::Index range, GRange, tileSize;
+    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
 
-  typedef typename Self::CoeffReturnType CoeffReturnType;
-  static const bool HasOptimizedImplementation = false;
-
-  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
-    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
-    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-    auto functors = TensorSycl::internal::extractFunctors(self.impl());
-
-    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
-
-    size_t GRange=num_coeffs_to_preserve;
-    if (tileSize>GRange) tileSize=GRange;
-    else if(GRange>tileSize){
-      size_t xMode = GRange % tileSize;
-      if (xMode != 0) GRange += (tileSize - xMode);
-    }
-    // getting final out buffer at the moment the created buffer is true because there is no need for assign
-    /// creating the shared memory for calculating reduction.
-    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
-    /// recursively apply reduction on it in order to reduce the whole.
-    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
-    Dims dims= self.xprDims();
-    Op functor = reducer;
-
-    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
-      // create a tuple of accessors from Evaluator
-      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
-      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
-
-      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
-        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
-        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
-        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
-        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
-        /// the device_evaluator is detectable and recognisable on the device.
-        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
-        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
-        /// const cast added as a naive solution to solve the qualifier drop error
-        auto globalid=itemID.get_global_linear_id();
-        if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
-          typename DeiceSelf::CoeffReturnType accum = functor.initialize();
-          GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
-          functor.finalize(accum);
-          output_accessor.get_pointer()[globalid]= accum;
-        }
-      });
-    });
-  dev.m_queue.throw_asynchronous();
+    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
+                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
+        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
+        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
     return false;
   }
 };
 
-}  // end namespace internal
+}  // namespace internal
 }  // namespace Eigen
 
 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index 99245f7789ce9e410c73dbaffdb2ea8c9376f272..a27d3646de15aa5c7656973c57f1c99c717dda07 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -31,7 +31,7 @@ class TensorLazyBaseEvaluator {
   int refCount() const { return m_refcount; }
 
  private:
-  // No copy, no assigment;
+  // No copy, no assignment;
   TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
   TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
 
@@ -44,6 +44,9 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, t
  public:
   //  typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
   typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef  TensorEvaluator<Expr, Device> EvalType;
 
   TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
     m_dims = m_impl.dimensions();
@@ -79,6 +82,8 @@ class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimension
  public:
   typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
   typedef typename Base::Scalar Scalar;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
   }
@@ -136,11 +141,17 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
     enum {
       IsAligned = false,
       PacketAccess = false,
+      BlockAccess = false,
+      PreferBlockAccess = false,
       Layout = PlainObjectType::Layout,
       CoordAccess = false,  // to be implemented
       RawAccess = false
     };
 
+    //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
+    typedef internal::TensorBlockNotImplemented TensorBlock;
+    //===------------------------------------------------------------------===//
+
     EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
     }
 
@@ -360,26 +371,34 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorRef<Derived>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
       : m_ref(m)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+  EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_ref.coeff(index);
@@ -389,7 +408,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
     return m_ref.coeffRef(index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); }
 
  protected:
   TensorRef<Derived> m_ref;
@@ -411,10 +430,16 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 14e392e365eafd3005d4b0177945738e02088e54..586ce68ab0ae3363127f0b5118ad9dd938195916 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -31,6 +31,7 @@ struct traits<TensorReverseOp<ReverseDimensions,
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename ReverseDimensions, typename XprType>
@@ -53,15 +54,16 @@ class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
                                           XprType>, WriteAccessors>
 {
   public:
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
-                                                                    StorageKind;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
+    typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors>Base;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
+                                                                      StorageKind;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
       const XprType& expr, const ReverseDimensions& reverse_dims)
       : m_xpr(expr), m_reverse_dims(reverse_dims) { }
 
@@ -72,24 +74,8 @@ class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other)
-    {
-      typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp)
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
 
   protected:
     typename XprType::Nested m_xpr;
@@ -107,19 +93,38 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = NumDims > 0,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
-      : m_impl(op.expression(), device), m_reverse(op.reverse())
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_reverse(op.reverse()),
+        m_device(device)
   {
     // Reversing a scalar isn't supported yet. It would be a no-op anyway.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -130,11 +135,13 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
       m_strides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
+        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
       }
     } else {
       m_strides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
+        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
       }
     }
   }
@@ -142,11 +149,20 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -155,8 +171,9 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -169,8 +186,9 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
         inputIndex += index;
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -202,6 +220,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     // local structure.
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
                                                             values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -209,6 +228,130 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     return rslt;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    // Block evaluation reads underlying memory in reverse order, and default
+    // cost model does not properly catch this in bytes stored/loaded.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+               target_size)
+        .addCostPerCoeff({0, 0, 24});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // TODO(ezhulenev): If underlying tensor expression supports and prefers
+    // block evaluation we must use it. Currently we use coeff and packet
+    // access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    // Offset in the output block.
+    Index block_offset = 0;
+
+    // Offset in the input Tensor.
+    Index input_offset = reverseIndex(desc.offset());
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      it[i].block_stride =
+          i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we can
+    // merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].size = it[effective_inner_dim].size * it[i].size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride ==
+                 (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].size;
+
+    // Prepare storage for the materialized reverse result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      // Copy inner-most dimension data from reversed location in input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+      // worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offset.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
@@ -222,13 +365,42 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
            TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fastStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
+  const Device EIGEN_DEVICE_REF m_device;
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : size(0),
+          count(0),
+          reverse(false),
+          block_stride(0),
+          block_span(0),
+          input_stride(0),
+          input_span(0) {}
+
+    Index size;
+    Index count;
+    bool reverse;
+    Index block_stride;
+    Index block_span;
+    Index input_stride;
+    Index input_span;
+  };
 };
 
 // Eval as lvalue
@@ -247,18 +419,23 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device) {}
 
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }
@@ -275,11 +452,11 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
     // This code is pilfered from TensorMorphing.h
     EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
-
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 8501466ce602e8f8adc4a1b6aea11ac9b32a7f66..beae854ddba66753048e97a4a0037ec56d12c73f 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -24,6 +24,7 @@ struct traits<TensorScanOp<Op, XprType> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Op, typename XprType>
@@ -76,8 +77,299 @@ protected:
   const bool m_exclusive;
 };
 
-template <typename Self, typename Reducer, typename Device>
-struct ScanLauncher;
+
+namespace internal {
+
+template <typename Self>
+EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
+                                      typename Self::CoeffReturnType* data) {
+  // Compute the scan along the axis, starting at the given offset
+  typename Self::CoeffReturnType accum = self.accumulator().initialize();
+  if (self.stride() == 1) {
+    if (self.exclusive()) {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      }
+    } else {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  } else {
+    if (self.exclusive()) {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        Index curr = offset + idx3 * self.stride();
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      }
+    } else {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        Index curr = offset + idx3 * self.stride();
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  }
+}
+
+template <typename Self>
+EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
+                                      typename Self::CoeffReturnType* data) {
+  using Scalar = typename Self::CoeffReturnType;
+  using Packet = typename Self::PacketReturnType;
+  // Compute the scan along the axis, starting at the calculated offset
+  Packet accum = self.accumulator().template initializePacket<Packet>();
+  if (self.stride() == 1) {
+    if (self.exclusive()) {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+      }
+    } else {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+      }
+    }
+  } else {
+    if (self.exclusive()) {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        const Index curr = offset + idx3 * self.stride();
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+      }
+    } else {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        const Index curr = offset + idx3 * self.stride();
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+      }
+    }
+  }
+}
+
+template <typename Self, bool Vectorize, bool Parallel>
+struct ReduceBlock {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+      // Calculate the starting offset for the scan
+      Index offset = idx1 + idx2;
+      ReduceScalar(self, offset, data);
+    }
+  }
+};
+
+// Specialization for vectorized reduction.
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    Index idx2 = 0;
+    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
+      // Calculate the starting offset for the packet scan
+      Index offset = idx1 + idx2;
+      ReducePacket(self, offset, data);
+    }
+    for (; idx2 < self.stride(); idx2++) {
+      // Calculate the starting offset for the scan
+      Index offset = idx1 + idx2;
+      ReduceScalar(self, offset, data);
+    }
+  }
+};
+
+// Single-threaded CPU implementation of scan
+template <typename Self, typename Reducer, typename Device,
+          bool Vectorize =
+              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
+               internal::reducer_traits<Reducer, Device>::PacketAccess)>
+struct ScanLauncher {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+    Index total_size = internal::array_prod(self.dimensions());
+
+    // We fix the index along the scan axis to 0 and perform a
+    // scan per remaining entry. The iteration is split into two nested
+    // loops to avoid an integer division by keeping track of each idx1 and
+    // idx2.
+    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
+      block_reducer(self, idx1, data);
+    }
+  }
+};
+
+#ifdef EIGEN_USE_THREADS
+
+// Adjust block_size to avoid false sharing of cachelines among
+// threads. Currently set to twice the cache line size on Intel and ARM
+// processors.
+EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
+  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
+  const Index items_per_cacheline =
+      numext::maxi<Index>(1, kBlockAlignment / item_size);
+  return items_per_cacheline * divup(block_size, items_per_cacheline);
+}
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    Index num_scalars = self.stride();
+    Index num_packets = 0;
+    if (self.stride() >= PacketSize) {
+      num_packets = self.stride() / PacketSize;
+      self.device().parallelFor(
+          num_packets,
+        TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
+                     16 * PacketSize * self.size(), true, PacketSize),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index packet = first; packet < last; ++packet) {
+            const Index idx2 = packet * PacketSize;
+            ReducePacket(self, idx1 + idx2, data);
+          }
+        });
+      num_scalars -= num_packets * PacketSize;
+    }
+    self.device().parallelFor(
+        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index scalar = first; scalar < last; ++scalar) {
+            const Index idx2 = num_packets * PacketSize + scalar;
+            ReduceScalar(self, idx1 + idx2, data);
+          }
+        });
+  }
+};
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    self.device().parallelFor(
+        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index idx2 = first; idx2 < last; ++idx2) {
+            ReduceScalar(self, idx1 + idx2, data);
+          }
+        });
+  }
+};
+
+// Specialization for multi-threaded execution.
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    const Index total_size = internal::array_prod(self.dimensions());
+    const Index inner_block_size = self.stride() * self.size();
+    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
+
+    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
+        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
+      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
+      launcher(self, data);
+      return;
+    }
+
+    if (parallelize_by_outer_blocks) {
+      // Parallelize over outer blocks.
+      const Index num_outer_blocks = total_size / inner_block_size;
+      self.device().parallelFor(
+          num_outer_blocks,
+          TensorOpCost(inner_block_size, inner_block_size,
+                       16 * PacketSize * inner_block_size, Vectorize,
+                       PacketSize),
+          [=](Index blk_size) {
+            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
+          },
+          [&](Index first, Index last) {
+            for (Index idx1 = first; idx1 < last; ++idx1) {
+              ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
+              block_reducer(self, idx1 * inner_block_size, data);
+            }
+          });
+    } else {
+      // Parallelize over inner packets/scalars dimensions when the reduction
+      // axis is not an inner dimension.
+      ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
+      for (Index idx1 = 0; idx1 < total_size;
+           idx1 += self.stride() * self.size()) {
+        block_reducer(self, idx1, data);
+      }
+    }
+  }
+};
+#endif  // EIGEN_USE_THREADS
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+  // Compute offset as in the CPU version
+  Index val = threadIdx.x + blockIdx.x * blockDim.x;
+  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+  if (offset + (self.size() - 1) * self.stride() < total_size) {
+    // Compute the scan along the axis, starting at the calculated offset
+    typename Self::CoeffReturnType accum = self.accumulator().initialize();
+    for (Index idx = 0; idx < self.size(); idx++) {
+      Index curr = offset + idx * self.stride();
+      if (self.exclusive()) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      } else {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  }
+  __syncthreads();
+
+}
+
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
+  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+     Index total_size = internal::array_prod(self.dimensions());
+     Index num_blocks = (total_size / self.size() + 63) / 64;
+     Index block_size = 64;
+
+     LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+  }
+};
+#endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)
+
+}  // namespace internal
 
 // Eval as rvalue
 template <typename Op, typename ArgType, typename Device>
@@ -85,30 +377,38 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 
   typedef TensorScanOp<Op, ArgType> XprType;
   typedef typename XprType::Index Index;
+  typedef const ArgType ChildTypeNoConst;
+  typedef const ArgType ChildType;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = true
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device),
         m_device(device),
         m_exclusive(op.exclusive()),
         m_accumulator(op.accumulator()),
         m_size(m_impl.dimensions()[op.axis()]),
-        m_stride(1),
+        m_stride(1), m_consume_dim(op.axis()),
         m_output(NULL) {
 
     // Accumulating a scalar isn't supported.
@@ -122,7 +422,11 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
         m_stride = m_stride * dims[i];
       }
     } else {
-      for (int i = NumDims - 1; i > op.axis(); --i) {
+      // dims can only be indexed through unsigned integers,
+      // so let's use an unsigned type to let the compiler knows.
+      // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
+      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
+      for (unsigned int i = NumDims - 1; i > axis; --i) {
         m_stride = m_stride * dims[i];
       }
     }
@@ -136,6 +440,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return m_stride;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
+    return m_consume_dim;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
     return m_size;
   }
@@ -156,16 +464,16 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return m_device;
   }
 
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
-    ScanLauncher<Self, Op, Device> launcher;
+    internal::ScanLauncher<Self, Op, Device> launcher;
     if (data) {
       launcher(*this, data);
       return false;
     }
 
     const Index total_size = internal::array_prod(dimensions());
-    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+    m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
     launcher(*this, m_output);
     return true;
   }
@@ -175,7 +483,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
   {
     return m_output;
   }
@@ -189,98 +497,31 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if (m_output != NULL) {
-      m_device.deallocate(m_output);
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_output) {
+      m_device.deallocate_temp(m_output);
       m_output = NULL;
     }
     m_impl.cleanup();
   }
 
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_output.bind(cgh);
+  }
+#endif
 protected:
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   const bool m_exclusive;
   Op m_accumulator;
   const Index m_size;
   Index m_stride;
-  CoeffReturnType* m_output;
-};
-
-// CPU implementation of scan
-// TODO(ibab) This single-threaded implementation should be parallelized,
-// at least by running multiple scans at the same time.
-template <typename Self, typename Reducer, typename Device>
-struct ScanLauncher {
-  void operator()(Self& self, typename Self::CoeffReturnType *data) {
-    Index total_size = internal::array_prod(self.dimensions());
-
-    // We fix the index along the scan axis to 0 and perform a
-    // scan per remaining entry. The iteration is split into two nested
-    // loops to avoid an integer division by keeping track of each idx1 and idx2.
-    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
-      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
-        // Calculate the starting offset for the scan
-        Index offset = idx1 + idx2;
-
-        // Compute the scan along the axis, starting at the calculated offset
-        typename Self::CoeffReturnType accum = self.accumulator().initialize();
-        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
-          Index curr = offset + idx3 * self.stride();
-
-          if (self.exclusive()) {
-            data[curr] = self.accumulator().finalize(accum);
-            self.accumulator().reduce(self.inner().coeff(curr), &accum);
-          } else {
-            self.accumulator().reduce(self.inner().coeff(curr), &accum);
-            data[curr] = self.accumulator().finalize(accum);
-          }
-        }
-      }
-    }
-  }
-};
-
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-
-// GPU implementation of scan
-// TODO(ibab) This placeholder implementation performs multiple scans in
-// parallel, but it would be better to use a parallel scan algorithm and
-// optimize memory access.
-template <typename Self, typename Reducer>
-__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
-  // Compute offset as in the CPU version
-  Index val = threadIdx.x + blockIdx.x * blockDim.x;
-  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
-
-  if (offset + (self.size() - 1) * self.stride() < total_size) {
-    // Compute the scan along the axis, starting at the calculated offset
-    typename Self::CoeffReturnType accum = self.accumulator().initialize();
-    for (Index idx = 0; idx < self.size(); idx++) {
-      Index curr = offset + idx * self.stride();
-      if (self.exclusive()) {
-        data[curr] = self.accumulator().finalize(accum);
-        self.accumulator().reduce(self.inner().coeff(curr), &accum);
-      } else {
-        self.accumulator().reduce(self.inner().coeff(curr), &accum);
-        data[curr] = self.accumulator().finalize(accum);
-      }
-    }
-  }
-  __syncthreads();
-
-}
-
-template <typename Self, typename Reducer>
-struct ScanLauncher<Self, Reducer, GpuDevice> {
-  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
-     Index total_size = internal::array_prod(self.dimensions());
-     Index num_blocks = (total_size / self.size() + 63) / 64;
-     Index block_size = 64;
-     LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
-  }
+  Index m_consume_dim;
+  EvaluatorPointerType m_output;
 };
-#endif  // EIGEN_USE_GPU && __CUDACC__
 
 }  // end namespace Eigen
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f68ecb6afb4d6209be3284725318be5c5352241
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -0,0 +1,513 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorScanSycl.h
+ *
+ * \brief:
+ *  Tensor Scan Sycl implement the extend  version of
+ * "Efficient parallel scan algorithms for GPUs." .for Tensor operations.
+ * The algorithm requires up to 3 stage (consequently 3 kernels) depending on
+ * the size of the tensor. In the first kernel (ScanKernelFunctor), each
+ * threads within the work-group individually reduces the allocated elements per
+ * thread in order to reduces the total number of blocks. In the next step all
+ * thread within the work-group will reduce the associated blocks into the
+ * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary
+ * buffer is given as an input and all the threads within a work-group scan and
+ * reduces the boundaries between the blocks (generated from the previous
+ * kernel). and write the data on the temporary buffer. If the second kernel is
+ * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will
+ * adjust the final result into the output buffer.
+ * The original algorithm for the parallel prefix sum can be found here:
+ *
+ * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
+ * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
+ *1, no. 1 (2008): 1-17.
+ *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
+#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
+#endif
+
+template <typename index_t>
+struct ScanParameters {
+  // must be power of 2
+  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
+  const index_t total_size;
+  const index_t non_scan_size;
+  const index_t scan_size;
+  const index_t non_scan_stride;
+  const index_t scan_stride;
+  const index_t panel_threads;
+  const index_t group_threads;
+  const index_t block_threads;
+  const index_t elements_per_group;
+  const index_t elements_per_block;
+  const index_t loop_range;
+
+  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
+                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
+                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
+      : total_size(total_size_),
+        non_scan_size(non_scan_size_),
+        scan_size(scan_size_),
+        non_scan_stride(non_scan_stride_),
+        scan_stride(scan_stride_),
+        panel_threads(panel_threads_),
+        group_threads(group_threads_),
+        block_threads(block_threads_),
+        elements_per_group(elements_per_group_),
+        elements_per_block(elements_per_block_),
+        loop_range(loop_range_) {}
+};
+
+enum class scan_step { first, second };
+template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
+          scan_step stp>
+struct ScanKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+
+  LocalAccessor scratch;
+  Evaluator dev_eval;
+  OutAccessor out_accessor;
+  OutAccessor temp_accessor;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  const bool inclusive;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
+                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
+                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
+                                                          const bool inclusive_)
+      : scratch(scratch_),
+        dev_eval(dev_eval_),
+        out_accessor(out_accessor_),
+        temp_accessor(temp_accessor_),
+        scanParameters(scanParameters_),
+        accumulator(accumulator_),
+        inclusive(inclusive_) {}
+
+  template <scan_step sst = stp, typename Input>
+  typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+      EIGEN_STRONG_INLINE
+      read(const Input &inpt, Index global_id) {
+    return inpt.coeff(global_id);
+  }
+
+  template <scan_step sst = stp, typename Input>
+  typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+      EIGEN_STRONG_INLINE
+      read(const Input &inpt, Index global_id) {
+    return inpt[global_id];
+  }
+
+  template <scan_step sst = stp, typename InclusiveOp>
+  typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
+    inclusive_op();
+  }
+
+  template <scan_step sst = stp, typename InclusiveOp>
+  typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  first_step_inclusive_Operation(InclusiveOp) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto out_ptr = out_accessor.get_pointer();
+    auto tmp_ptr = temp_accessor.get_pointer();
+    auto scratch_ptr = scratch.get_pointer().get();
+
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+      Index tmp = data_offset % scanParameters.panel_threads;
+      const Index panel_id = data_offset / scanParameters.panel_threads;
+      const Index group_id = tmp / scanParameters.group_threads;
+      tmp = tmp % scanParameters.group_threads;
+      const Index block_id = tmp / scanParameters.block_threads;
+      const Index local_id = tmp % scanParameters.block_threads;
+      // we put one element per packet in scratch_mem
+      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
+      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
+      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
+      CoeffReturnType inclusive_scan;
+      // the actual panel size is scan_size * non_scan_size.
+      // elements_per_panel is roundup to power of 2 for binary tree
+      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+      const Index group_offset = group_id * scanParameters.non_scan_stride;
+      // This will be effective when the size is bigger than elements_per_block
+      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
+      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+      Index next_elements = 0;
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
+                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
+                           (global_id < scanParameters.total_size))
+                              ? read(dev_eval, global_id)
+                              : accumulator.initialize();
+        next_elements += scanParameters.scan_stride;
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
+        }
+      });
+      // This for loop must be 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        Index private_offset = 1;
+        // build sum in place up the tree
+        EIGEN_UNROLL_LOOP
+        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
+          EIGEN_UNROLL_LOOP
+          for (Index l = 0; l < d; l++) {
+            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+            CoeffReturnType accum = accumulator.initialize();
+            accumulator.reduce(private_scan[ai], &accum);
+            accumulator.reduce(private_scan[bi], &accum);
+            private_scan[bi] = accumulator.finalize(accum);
+          }
+          private_offset *= 2;
+        }
+        scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
+            private_scan[PacketSize - 1 + packetIndex];
+        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
+        // traverse down tree & build scan
+        EIGEN_UNROLL_LOOP
+        for (Index d = 1; d < PacketSize; d *= 2) {
+          private_offset >>= 1;
+          EIGEN_UNROLL_LOOP
+          for (Index l = 0; l < d; l++) {
+            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+            CoeffReturnType accum = accumulator.initialize();
+            accumulator.reduce(private_scan[ai], &accum);
+            accumulator.reduce(private_scan[bi], &accum);
+            private_scan[ai] = private_scan[bi];
+            private_scan[bi] = accumulator.finalize(accum);
+          }
+        }
+      }
+
+      Index offset = 1;
+      // build sum in place up the tree
+      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch_ptr[ai], &accum);
+          accumulator.reduce(scratch_ptr[bi], &accum);
+          scratch_ptr[bi] = accumulator.finalize(accum);
+        }
+        offset *= 2;
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // next step optimisation
+      if (local_id == 0) {
+        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
+          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
+                                    scanParameters.non_scan_size +
+                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
+                                block_id;
+          tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
+        }
+        // clear the last element
+        scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+      }
+      // traverse down tree & build scan
+      for (Index d = 1; d < scratch_stride; d *= 2) {
+        offset >>= 1;
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch_ptr[ai], &accum);
+          accumulator.reduce(scratch_ptr[bi], &accum);
+          scratch_ptr[ai] = scratch_ptr[bi];
+          scratch_ptr[bi] = accumulator.finalize(accum);
+        }
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // This for loop must be 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        EIGEN_UNROLL_LOOP
+        for (Index i = 0; i < PacketSize; i++) {
+          CoeffReturnType accum = private_scan[packetIndex + i];
+          accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+          private_scan[packetIndex + i] = accumulator.finalize(accum);
+        }
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
+          private_scan[0] = accumulator.finalize(inclusive_scan);
+        }
+      });
+      next_elements = 0;
+      // right the first set of private param
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
+          out_ptr[global_id] = private_scan[private_id];
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }  // end for loop
+  }
+};
+
+template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
+struct ScanAdjustmentKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+  InAccessor in_accessor;
+  OutAccessor out_accessor;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
+                                                                    OutAccessor out_accessor_,
+                                                                    const ScanParameters<Index> scanParameters_,
+                                                                    Op accumulator_)
+      : in_accessor(in_accessor_),
+        out_accessor(out_accessor_),
+        scanParameters(scanParameters_),
+        accumulator(accumulator_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto in_ptr = in_accessor.get_pointer();
+    auto out_ptr = out_accessor.get_pointer();
+
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+      Index tmp = data_offset % scanParameters.panel_threads;
+      const Index panel_id = data_offset / scanParameters.panel_threads;
+      const Index group_id = tmp / scanParameters.group_threads;
+      tmp = tmp % scanParameters.group_threads;
+      const Index block_id = tmp / scanParameters.block_threads;
+      const Index local_id = tmp % scanParameters.block_threads;
+
+      // the actual panel size is scan_size * non_scan_size.
+      // elements_per_panel is roundup to power of 2 for binary tree
+      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+      const Index group_offset = group_id * scanParameters.non_scan_stride;
+      // This will be effective when the size is bigger than elements_per_block
+      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
+
+      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
+      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
+      CoeffReturnType adjust_val = in_ptr[in_id];
+
+      Index next_elements = 0;
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          CoeffReturnType accum = adjust_val;
+          accumulator.reduce(out_ptr[global_id], &accum);
+          out_ptr[global_id] = accumulator.finalize(accum);
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }
+  }
+};
+
+template <typename Index>
+struct ScanInfo {
+  const Index &total_size;
+  const Index &scan_size;
+  const Index &panel_size;
+  const Index &non_scan_size;
+  const Index &scan_stride;
+  const Index &non_scan_stride;
+
+  Index max_elements_per_block;
+  Index block_size;
+  Index panel_threads;
+  Index group_threads;
+  Index block_threads;
+  Index elements_per_group;
+  Index elements_per_block;
+  Index loop_range;
+  Index global_range;
+  Index local_range;
+  const Eigen::SyclDevice &dev;
+  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
+                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
+                               const Eigen::SyclDevice &dev_)
+      : total_size(total_size_),
+        scan_size(scan_size_),
+        panel_size(panel_size_),
+        non_scan_size(non_scan_size_),
+        scan_stride(scan_stride_),
+        non_scan_stride(non_scan_stride_),
+        dev(dev_) {
+    // must be power of 2
+    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
+                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
+
+    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
+
+    elements_per_group =
+        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
+    const Index elements_per_panel = elements_per_group * non_scan_size;
+    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
+    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
+    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
+    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
+    block_size = elements_per_group / elements_per_block;
+#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
+    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
+#else
+    const Index max_threads = panel_threads * panel_size;
+#endif
+    global_range = roundUp(max_threads, local_range);
+    loop_range = Index(
+        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
+  }
+  inline ScanParameters<Index> get_scan_parameter() {
+    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
+                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
+  }
+  inline cl::sycl::nd_range<1> get_thread_range() {
+    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+  }
+};
+
+template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
+struct SYCLAdjustBlockOffset {
+  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
+                                                           Reducer &accumulator, const Index total_size,
+                                                           const Index scan_size, const Index panel_size,
+                                                           const Index non_scan_size, const Index scan_stride,
+                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+
+    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
+        AdjustFuctor;
+    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
+                                                                      scan_info.max_elements_per_block,
+                                                                      scan_info.get_scan_parameter(), accumulator);
+  }
+};
+
+template <typename CoeffReturnType, scan_step stp>
+struct ScanLauncher_impl {
+  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
+  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
+                                             const Index total_size, const Index scan_size, const Index panel_size,
+                                             const Index non_scan_size, const Index scan_stride,
+                                             const Index non_scan_stride, const bool inclusive,
+                                             const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
+    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
+    CoeffReturnType *temp_pointer =
+        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
+    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+
+    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
+    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
+        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
+        scan_info.get_scan_parameter(), accumulator, inclusive);
+
+    if (scan_info.block_size > 1) {
+      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
+          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
+          non_scan_size, Index(1), scan_info.block_size, false, dev);
+
+      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
+          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
+          non_scan_stride, dev);
+    }
+    dev.deallocate_temp(temp_pointer);
+  }
+};
+
+}  // namespace internal
+}  // namespace TensorSycl
+namespace internal {
+template <typename Self, typename Reducer, bool vectorize>
+struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
+  typedef typename Self::Index Index;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::Storage Storage;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  void operator()(Self &self, EvaluatorPointerType data) {
+    const Index total_size = internal::array_prod(self.dimensions());
+    const Index scan_size = self.size();
+    const Index scan_stride = self.stride();
+    // this is the scan op (can be sum or ...)
+    auto accumulator = self.accumulator();
+    auto inclusive = !self.exclusive();
+    auto consume_dim = self.consume_dim();
+    auto dev = self.device();
+
+    auto dims = self.inner().dimensions();
+
+    Index non_scan_size = 1;
+    Index panel_size = 1;
+    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < consume_dim; i++) {
+        non_scan_size *= dims[i];
+      }
+      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
+        panel_size *= dims[i];
+      }
+    } else {
+      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
+        non_scan_size *= dims[i];
+      }
+      for (int i = consume_dim - 1; i >= 0; i--) {
+        panel_size *= dims[i];
+      }
+    }
+    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
+    auto eval_impl = self.inner();
+    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
+        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
+        inclusive, dev);
+  }
+};
+} // namespace internal
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 113c060e3f12c39ff07d965fe61fbc79f45f557e..e5e5efdeecad146089fea2fdf3697c030adbdb07 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -31,6 +31,7 @@ struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Shuffle, typename XprType>
@@ -53,15 +54,16 @@ template<typename Shuffle, typename XprType>
 class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
+    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle)
-      : m_xpr(expr), m_shuffle(shuffle) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
+      : m_xpr(expr), m_shuffle(shfl) {}
 
     EIGEN_DEVICE_FUNC
     const Shuffle& shufflePermutation() const { return m_shuffle; }
@@ -70,24 +72,8 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other)
-    {
-      typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
 
   protected:
     typename XprType::Nested m_xpr;
@@ -99,6 +85,7 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
 template<typename Shuffle, typename ArgType, typename Device>
 struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 {
+  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
   typedef TensorShufflingOp<Shuffle, ArgType> XprType;
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
@@ -106,100 +93,246 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_device(device),
+        m_impl(op.expression(), device)
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Shuffle& shuffle = op.shufflePermutation();
+    m_is_identity = true;
     for (int i = 0; i < NumDims; ++i) {
+      m_shuffle[i] = static_cast<int>(shuffle[i]);
       m_dimensions[i] = input_dims[shuffle[i]];
+      m_inverseShuffle[shuffle[i]] = i;
+      if (m_is_identity && shuffle[i] != i) {
+        m_is_identity = false;
+      }
     }
 
-    array<Index, NumDims> inputStrides;
-
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputStrides[0] = 1;
+      m_unshuffledInputStrides[0] = 1;
       m_outputStrides[0] = 1;
+
       for (int i = 1; i < NumDims; ++i) {
-        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+        m_unshuffledInputStrides[i] =
+            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
+                  m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
       }
     } else {
-      inputStrides[NumDims - 1] = 1;
+      m_unshuffledInputStrides[NumDims - 1] = 1;
       m_outputStrides[NumDims - 1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
-        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+        m_unshuffledInputStrides[i] =
+            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
+                  m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
       }
     }
 
     for (int i = 0; i < NumDims; ++i) {
-      m_inputStrides[i] = inputStrides[shuffle[i]];
+      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
+  template <int LoadMode, typename Self, bool ImplPacketAccess>
+  struct PacketLoader {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = self.coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template<int LoadMode, typename Self>
+  struct PacketLoader<LoadMode, Self, true> {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      if (self.m_is_identity) {
+        return self.m_impl.template packet<LoadMode>(index);
+      } else {
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        EIGEN_UNROLL_LOOP
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = self.coeff(index + i);
+        }
+        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+        return rslt;
+      }
+    }
+  };
+
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+        eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
+  }
 
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    for (int i = 0; i < PacketSize; ++i) {
-      values[i] = coeff(index+i);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const int inner_dim =
+        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+    const size_t target_size = m_device.firstLevelCacheSize();
+    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
+
+    // Shuffled inner dimensions leads to a random memory access, which is not
+    // captured by default cost model bytes loaded/stored. We add this cost
+    // explicitly. The number of cycles picked based on the benchmarks.
+    // TODO(ezhulenev): This number was picked based on a very questionable
+    // benchmarks, add benchmarks that are representative of real workloads.
+    using BlockRequirements = internal::TensorBlockResourceRequirements;
+    if (inner_dim_shuffled) {
+      return BlockRequirements::uniform<Scalar>(target_size)
+          .addCostPerCoeff({0, 0, NumDims * 28});
+    } else {
+      return BlockRequirements::skewed<Scalar>(target_size);
     }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool root_of_expr_ast = false) const {
+    assert(m_impl.data() != NULL);
+
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(
+            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
+
+    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
+    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
+
+    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
+                         block_storage.data());
+
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    return block_storage.AsTensorMaterializedBlock();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
+                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                            2 * TensorOpCost::MulCost<Index>() +
                                            TensorOpCost::DivCost<Index>());
     return m_impl.costPerCoeff(vectorized) +
-           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
+      Index input_index,
+      const DSizes<Index, NumDims>& input_block_strides,
+      const DSizes<Index, NumDims>& output_block_strides,
+      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
+    Index output_index = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = input_index / fast_input_block_strides[i];
+        output_index += idx * output_block_strides[m_inverseShuffle[i]];
+        input_index -= idx * input_block_strides[i];
+      }
+      return output_index + input_index *
+          output_block_strides[m_inverseShuffle[0]];
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = input_index / fast_input_block_strides[i];
+        output_index += idx * output_block_strides[m_inverseShuffle[i]];
+        input_index -= idx * input_block_strides[i];
+      }
+      return output_index + input_index *
+          output_block_strides[m_inverseShuffle[NumDims - 1]];
+    }
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       return inputIndex + index * m_inputStrides[0];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
@@ -208,8 +341,15 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   }
 
   Dimensions m_dimensions;
+  bool m_is_identity;
+  array<int, NumDims> m_shuffle;
+  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
   array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
+  array<Index, NumDims> m_unshuffledInputStrides;
+
+  const Device EIGEN_DEVICE_REF m_device;
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
@@ -228,15 +368,24 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device)
   { }
 
@@ -252,10 +401,68 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    eigen_assert(this->m_impl.data() != NULL);
+
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    const Scalar* block_buffer = block.data();
+
+    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
+    // expression with coefficient and packet access as `src`.
+    void* mem = NULL;
+    if (block_buffer == NULL) {
+      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
+      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
+              buf),
+          block.expr());
+
+      block_buffer = buf;
+    }
+
+    // Read from block.
+    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
+                         block_buffer);
+
+    // Write to the output buffer.
+    typename TensorBlockIO::Dimensions output_strides(
+        this->m_unshuffledInputStrides);
+    typename TensorBlockIO::Dimensions output_dimensions;
+    for (int i = 0; i < NumDims; ++i) {
+      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
+    }
+    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
+                         this->srcCoeff(desc.offset()));
+
+    // Reorder dimensions according to the shuffle.
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
+    for (int i = 0; i < NumDims; ++i) {
+      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
+    }
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    // Deallocate temporary buffer used for the block materialization.
+    if (mem != NULL) this->m_device.deallocate(mem);
+  }
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index e6a666f78177caf56cb2dbd961dbd0e7c4aa4719..5ff0880e78ef3a48f30b85257176cbde3dea0935 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -45,8 +45,6 @@ class TensorStorage
   static const std::size_t MinSize = max_n_1<Size>::size;
   EIGEN_ALIGN_MAX T m_data[MinSize];
 
-  FixedDimensions m_dimensions;
-
  public:
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE TensorStorage() {
@@ -57,14 +55,17 @@ class TensorStorage
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE const T *data() const { return m_data; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; }
+  static EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const FixedDimensions& dimensions()
+  {
+    static const FixedDimensions* singleton_dimensions = new FixedDimensions();
+    return *singleton_dimensions;
+  }
 
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
+  EIGEN_STRONG_INLINE DenseIndex size() const { return Size; }
 };
 
-
 // pure dynamic
 template<typename T, typename IndexType, int NumIndices_, int Options_>
 class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
@@ -107,6 +108,20 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
       return *this;
     }
 
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage()
+    {
+      *this = std::move(other);
+    }
+    
+    EIGEN_DEVICE_FUNC Self& operator=(Self&& other)
+    {
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_dimensions, other.m_dimensions);
+      return *this;
+    }
+#endif
+
     EIGEN_DEVICE_FUNC  ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
     EIGEN_DEVICE_FUNC  void swap(Self& other)
     { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); }
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 6c35bfdb6ce48b0dac8f95075591f3e4d58d6ca4..2f62a668f4dac17452dc3f0cf6323bfad130e8ca 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -31,12 +31,13 @@ struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Strides, typename XprType>
 struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingOp<Strides, XprType>& type;
+  typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename Strides, typename XprType>
@@ -53,14 +54,15 @@ template<typename Strides, typename XprType>
 class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
+    typedef TensorBase<TensorStridingOp<Strides, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
       : m_xpr(expr), m_dims(dims) {}
 
     EIGEN_DEVICE_FUNC
@@ -70,24 +72,7 @@ class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other)
-    {
-      typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -106,22 +91,30 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     m_dimensions = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
-      m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+      m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -146,13 +139,14 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     }
   }
 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -170,6 +164,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
         const Index idx1 = indices[1] / m_outputStrides[i];
@@ -181,6 +176,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       inputIndices[0] += indices[0] * m_inputStrides[0];
       inputIndices[1] += indices[1] * m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
         const Index idx1 = indices[1] / m_outputStrides[i];
@@ -200,6 +196,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -222,13 +219,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
         TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
@@ -236,6 +240,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       }
       inputIndex += index * m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
@@ -252,7 +257,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
-
 // Eval as lvalue
 template<typename Strides, typename ArgType, typename Device>
 struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -267,19 +271,20 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device) { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
   {
@@ -295,6 +300,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
         const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -306,6 +312,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
       inputIndices[0] += indices[0] * this->m_inputStrides[0];
       inputIndices[1] += indices[1] * this->m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
         const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -325,6 +332,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
       internal::pstore<Scalar, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
deleted file mode 100644
index bb8800d458ca4b8a41f12a004ab2f9b2e477c92b..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ /dev/null
@@ -1,82 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: eigen@codeplay.com
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// General include header of SYCL target for Tensor Module
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
-
-#ifdef EIGEN_USE_SYCL
-
-// global pointer to set different attribute state for a class
-template <class T>
-struct MakeGlobalPointer {
-  typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
-};
-
-// global pointer to set different attribute state for a class
-template <class T>
-struct MakeLocalPointer {
-  typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
-};
-
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
-  struct NoOP;
-
-template<bool IsConst, typename T> struct GetType{
-  typedef const T Type;
-};
-template<typename T> struct GetType<false, T>{
-  typedef T Type;
-};
-
-}
-}
-}
-
-// tuple construction
-#include "TensorSyclTuple.h"
-
-// counting number of leaf at compile time
-#include "TensorSyclLeafCount.h"
-
-// The index PlaceHolder takes the actual expression and replaces the actual
-// data on it with the place holder. It uses the same pre-order expression tree
-// traverse as the leaf count in order to give the right access number to each
-// node in the expression
-#include "TensorSyclPlaceHolderExpr.h"
-
-// creation of an accessor tuple from a tuple of SYCL buffers
-#include "TensorSyclExtractAccessor.h"
-
-// this is used to change the address space type in tensor map for GPU
-#include "TensorSyclConvertToDeviceExpression.h"
-
-// this is used to extract the functors
-#include "TensorSyclExtractFunctors.h"
-
-// this is used to create tensormap on the device
-// this is used to construct the expression on the device
-#include "TensorSyclExprConstructor.h"
-
-/// this is used for extracting tensor reduction
-#include "TensorReductionSycl.h"
-
-// kernel execution using fusion
-#include "TensorSyclRun.h"
-
-#endif  // end of EIGEN_USE_SYCL
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
deleted file mode 100644
index 8729c86ee8a270ceb208ee6e295fc98951f0d054..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
- *
- * \brief:
- *  Conversion from host pointer to device pointer
- *  inside leaf nodes of the expression.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-/// \struct ConvertToDeviceExpression
-/// \brief This struct is used to convert the MakePointer in the host expression
-/// to the MakeGlobalPointer for the device expression. For the leafNodes
-/// containing the pointer. This is due to the fact that the address space of
-/// the pointer T* is different on the host and the device.
-template <typename Expr>
-struct ConvertToDeviceExpression;
-
-template<template<class...> class NonOpCategory, bool IsConst, typename... Args>
-struct NonOpConversion{
-  typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type...> >::Type Type;
-};
-
-
-template<template<class, template <class> class > class NonOpCategory, bool IsConst, typename Args>
-struct DeviceConvertor{
-  typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type, MakeGlobalPointer> >::Type Type;
-};
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is TensorMap
-#define TENSORMAPCONVERT(CVQual)\
-template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\
-  typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
-};
-
-TENSORMAPCONVERT(const)
-TENSORMAPCONVERT()
-#undef TENSORMAPCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp
-#define CATEGORYCONVERT(CVQual)\
-template <template<class, class...> class Category, typename OP, typename... subExprs>\
-struct ConvertToDeviceExpression<CVQual Category<OP, subExprs...> > {\
-  typedef CVQual Category<OP, typename ConvertToDeviceExpression<subExprs>::Type... > Type;\
-};
-CATEGORYCONVERT(const)
-CATEGORYCONVERT()
-#undef CATEGORYCONVERT
-
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is  TensorCwiseSelectOp
-#define SELECTOPCONVERT(CVQual, Res)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
-struct ConvertToDeviceExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >\
-: NonOpConversion<TensorSelectOp, Res, IfExpr, ThenExpr, ElseExpr> {};
-SELECTOPCONVERT(const, true)
-SELECTOPCONVERT(, false)
-#undef SELECTOPCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is const AssingOP
-#define ASSIGNCONVERT(CVQual, Res)\
-template <typename LHSExpr, typename RHSExpr>\
-struct ConvertToDeviceExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr> >\
-: NonOpConversion<TensorAssignOp, Res, LHSExpr, RHSExpr>{};
-
-ASSIGNCONVERT(const, true)
-ASSIGNCONVERT(, false)
-#undef ASSIGNCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is either TensorForcedEvalOp or TensorEvalToOp
-#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\
-template <typename Expr>\
-struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
-: DeviceConvertor<ExprNode, Res, Expr>{};
-
-KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
-KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
-KERNELBROKERCONVERT(const, true, TensorEvalToOp)
-KERNELBROKERCONVERT(, false, TensorEvalToOp)
-#undef KERNELBROKERCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
-#define KERNELBROKERCONVERTREDUCTION(CVQual)\
-template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\
-  typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\
-};
-
-KERNELBROKERCONVERTREDUCTION(const)
-KERNELBROKERCONVERTREDUCTION()
-#undef KERNELBROKERCONVERTREDUCTION
-
-}  // namespace internal
-}  // namespace TensorSycl
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX1
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
deleted file mode 100644
index 983f6318085f9a7652e3233db386f5680c972b2b..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ /dev/null
@@ -1,239 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclExprConstructor.h
- *
- * \brief:
- *  This file re-create an expression on the SYCL device in order
- *  to use the original tensor evaluator.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// this class is used by EvalToOp in order to create an lhs expression which is
-/// a pointer from an accessor on device-only buffer
-template <typename PtrType, size_t N, typename... Params>
-struct EvalToLHSConstructor {
-  PtrType expr;
-  EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}
-};
-
-/// struct ExprConstructor is used to reconstruct the expression on the device and
-/// recreate the expression with MakeGlobalPointer containing the device address
-/// space for the TensorMap pointers used in eval function.
-/// It receives the original expression type, the functor of the node, the tuple
-/// of accessors, and the device expression type to re-instantiate the
-/// expression tree for the device
-template <typename OrigExpr, typename IndexExpr, typename... Params>
-struct ExprConstructor;
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorMap
-#define TENSORMAP(CVQual)\
-template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
-template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
-  typedef  CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>  Type;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
-  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
-};
-
-TENSORMAP(const)
-TENSORMAP()
-#undef TENSORMAP
-
-#define UNARYCATEGORY(CVQual)\
-template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
-  typedef  ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_type;\
-  my_type rhsExpr;\
-  typedef CVQual UnaryCategory<OP, typename my_type::Type> Type;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\
-};
-
-UNARYCATEGORY(const)
-UNARYCATEGORY()
-#undef UNARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorBinaryOp
-#define BINARYCATEGORY(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\
-typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual BinaryCategory<OP, OrigLHSExpr, OrigRHSExpr>,  CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Params...> {\
-  typedef  ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
-  typedef  ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
-  typedef  CVQual BinaryCategory<OP, typename my_left_type::Type, typename my_right_type::Type> Type;\
-  my_left_type lhsExpr;\
-  my_right_type rhsExpr;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\
-};
-
-BINARYCATEGORY(const)
-BINARYCATEGORY()
-#undef BINARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorCwiseTernaryOp
-#define TERNARYCATEGORY(CVQual)\
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename OrigArg1Expr, typename OrigArg2Expr,typename OrigArg3Expr,\
-typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename... Params>\
-struct ExprConstructor<CVQual TernaryCategory<OP, OrigArg1Expr, OrigArg2Expr, OrigArg3Expr>, CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Params...> {\
-  typedef ExprConstructor<OrigArg1Expr, Arg1Expr, Params...> my_arg1_type;\
-  typedef ExprConstructor<OrigArg2Expr, Arg2Expr, Params...> my_arg2_type;\
-  typedef ExprConstructor<OrigArg3Expr, Arg3Expr, Params...> my_arg3_type;\
-  typedef  CVQual TernaryCategory<OP, typename my_arg1_type::Type, typename my_arg2_type::Type, typename my_arg3_type::Type> Type;\
-  my_arg1_type arg1Expr;\
-  my_arg2_type arg2Expr;\
-  my_arg3_type arg3Expr;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD,const utility::tuple::Tuple<Params...> &t)\
-  : arg1Expr(funcD.arg1Expr, t), arg2Expr(funcD.arg2Expr, t), arg3Expr(funcD.arg3Expr, t), expr(arg1Expr.expr, arg2Expr.expr, arg3Expr.expr, funcD.func) {}\
-};
-
-TERNARYCATEGORY(const)
-TERNARYCATEGORY()
-#undef TERNARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorCwiseSelectOp
-#define SELECTOP(CVQual)\
-template <typename OrigIfExpr, typename OrigThenExpr, typename OrigElseExpr, typename IfExpr, typename ThenExpr, typename ElseExpr, typename... Params>\
-struct ExprConstructor< CVQual TensorSelectOp<OrigIfExpr, OrigThenExpr, OrigElseExpr>, CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Params...> {\
-  typedef  ExprConstructor<OrigIfExpr, IfExpr, Params...> my_if_type;\
-  typedef  ExprConstructor<OrigThenExpr, ThenExpr, Params...> my_then_type;\
-  typedef  ExprConstructor<OrigElseExpr, ElseExpr, Params...> my_else_type;\
-  typedef CVQual TensorSelectOp<typename my_if_type::Type, typename my_then_type::Type, typename my_else_type::Type> Type;\
-  my_if_type ifExpr;\
-  my_then_type thenExpr;\
-  my_else_type elseExpr;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : ifExpr(funcD.ifExpr, t), thenExpr(funcD.thenExpr, t), elseExpr(funcD.elseExpr, t), expr(ifExpr.expr, thenExpr.expr, elseExpr.expr) {}\
-};
-
-SELECTOP(const)
-SELECTOP()
-#undef SELECTOP
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// const TensorAssignOp
-#define ASSIGN(CVQual)\
-template <typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr, typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>,  CVQual TensorAssignOp<LHSExpr, RHSExpr>, Params...> {\
-  typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
-  typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
-  typedef CVQual TensorAssignOp<typename my_left_type::Type, typename my_right_type::Type>  Type;\
-  my_left_type lhsExpr;\
-  my_right_type rhsExpr;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : lhsExpr(funcD.lhsExpr, t), rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr) {}\
- };
-
- ASSIGN(const)
- ASSIGN()
- #undef ASSIGN
-/// specialisation of the \ref ExprConstructor struct when the node type is
-///  TensorEvalToOp
-#define EVALTO(CVQual)\
-template <typename OrigExpr, typename Expr, typename... Params>\
-struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
-  typedef ExprConstructor<OrigExpr, Expr, Params...> my_expr_type;\
-  typedef typename TensorEvalToOp<OrigExpr, MakeGlobalPointer>::PointerType my_buffer_type;\
-  typedef CVQual TensorEvalToOp<typename my_expr_type::Type, MakeGlobalPointer> Type;\
-  my_expr_type nestedExpression;\
-  EvalToLHSConstructor<my_buffer_type, 0, Params...> buffer;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
-};
-
-EVALTO(const)
-EVALTO()
-#undef EVALTO
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorForcedEvalOp
-#define FORCEDEVAL(CVQual)\
-template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
-  typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
-  TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
-  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
-};
-
-FORCEDEVAL(const)
-FORCEDEVAL()
-#undef FORCEDEVAL
-
-template <bool Conds,  size_t X , size_t Y > struct ValueCondition {
-  static const size_t Res =X;
-};
-template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
-  static const size_t Res =Y;
-};
-
-/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXPR(CVQual)\
-template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
-  static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0,  1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
-  typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
-  NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
-  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
-};
-
-SYCLREDUCTIONEXPR(const)
-SYCLREDUCTIONEXPR()
-#undef SYCLREDUCTIONEXPR
-
-/// template deduction for \ref ExprConstructor struct
-template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
-auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
-    -> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) {
-  return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t);
-}
-
-} /// namespace TensorSycl
-} /// namespace internal
-} /// namespace Eigen
-
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
deleted file mode 100644
index cc18fcdf908f8786d2c23ae09abf507d53002519..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ /dev/null
@@ -1,204 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclExtractAccessor.h
- *
- * \brief:
- * ExtractAccessor takes Expression placeHolder expression and the tuple of sycl
- * buffers as an input. Using pre-order tree traversal, ExtractAccessor
- * recursively calls itself for its children in the expression tree. The
- * leaf node in the PlaceHolder expression is nothing but a container preserving
- * the order of the actual data in the tuple of sycl buffer. By invoking the
- * extract accessor for the PlaceHolder<N>, an accessor is created for the Nth
- * buffer in the tuple of buffers. This accessor is then added as an Nth
- * element in the tuple of accessors. In this case we preserve the order of data
- * in the expression tree.
- *
- * This is the specialisation of extract accessor method for different operation
- * type in the PlaceHolder expression.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// struct ExtractAccessor: Extract Accessor Class is used to extract the
-/// accessor from a buffer.
-/// Depending on the type of the leaf node we can get a read accessor or a
-/// read_write accessor
-template <typename Evaluator>
-struct ExtractAccessor;
-
-struct AccessorConstructor{
-  template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval)
-  -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
-  return ExtractAccessor<Arg>::getTuple(cgh, eval);
-  }
-
-  template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2)
-  -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
-    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
-  }
-  template<typename Arg1, typename Arg2, typename Arg3>	static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3)
-  -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
-    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
-  }
-  template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)
-  -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
-  typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
-    return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp
-template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){
-    return AccessorConstructor::getTuple(cgh, eval.impl());
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp,  TensorCwiseUnaryOp and  TensorBroadcastingOp
-template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
-template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
-    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
-  }
-};
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
-template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorCwiseTernaryOp
-template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
-    return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
-template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorCwiseSelectOp. This is a special case where there is no OP
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
-    return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseSelectOp. This is a special case where there is no OP
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
-template <typename LHSExpr, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
-    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
- }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
-template <typename LHSExpr, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
-#define TENSORMAPEXPR(CVQual, ACCType)\
-template <typename PlainObjectType, int Options_, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\
-  -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
-    return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
-  }\
-};
-TENSORMAPEXPR(const, cl::sycl::access::mode::read)
-TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
-#undef TENSORMAPEXPR
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
-template <typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)
-  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
-    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
-template <typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
-template <typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)
-  -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
-    return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
-template <typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
-template <typename OP, typename Dim, typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)
-  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
-    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-template <typename OP, typename Dim, typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};
-
-/// template deduction for \ref ExtractAccessor
-template <typename Evaluator>
-auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
--> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
-  return ExtractAccessor<Evaluator>::getTuple(cgh, expr);
-}
-
-} /// namespace TensorSycl
-} /// namespace internal
-} /// namespace Eigen
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
deleted file mode 100644
index 9edd38ea432379f7cfe7fa0195696d2c6bc30e49..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ /dev/null
@@ -1,177 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclextractFunctors.h
- *
- * \brief:
- *  Used to extract all the functors allocated to each node of the expression
-*tree.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// struct FunctorExtractor:  This struct is used to extract the functors
-/// constructed on
-/// the host-side, to pack them and reuse them in reconstruction of the
-/// expression on the device.
-/// We have to do that as in Eigen the functors are not stateless so we cannot
-/// re-instantiate them on the device.
-/// We have to pass instantiated functors to the device.
-// This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
-template <typename Evaluator> struct FunctorExtractor{
-  typedef typename Evaluator::Dimensions Dimensions;
-  const Dimensions m_dimensions;
-  const Dimensions& dimensions() const { return m_dimensions; }
-  FunctorExtractor(const Evaluator& expr)
-  : m_dimensions(expr.dimensions()) {}
-
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
-  OP func;
-  FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr)
-  : rhsExpr(expr.impl()), func(expr.functor()) {}
-};
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
-: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseBinaryOp
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
-  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
-  OP func;
-  FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)
-  : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseBinaryOp
-template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP,  LHSExpr, RHSExpr>, Dev> >
-: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP,  LHSExpr, RHSExpr>, Dev> >{};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseTernaryOp
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>
-struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;
-  FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;
-  FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;
-  OP func;
-  FunctorExtractor(const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)
-  : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseTernaryOp
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
-struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
-:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
-struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;
-  FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;
-  FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;
-  FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)
-  : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
-:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorAssignOp. This is an specialisation without OP so it has to be separated.
-template <typename LHSExpr, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
-  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
-  FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)
-  : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorAssignOp. This is an specialisation without OP so it has to be separated.
-template <typename LHSExpr, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
-:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
-
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorEvalToOp, This is an specialisation without OP so it has to be separated.
-template <typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
-  FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr)
-  : rhsExpr(expr.impl()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorEvalToOp. This is a specialisation without OP so it has to be separated.
-template <typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
-: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};
-
-template<typename Dim, size_t NumOutputDim> struct DimConstr {
-template<typename InDim>
-  static inline Dim getDim(InDim dims ) {return dims;}
-};
-
-template<typename Dim> struct DimConstr<Dim, 0> {
-  template<typename InDim>
-    static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
-};
-
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{
-  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
-  typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;
-  const Dimensions m_dimensions;
-  const Dimensions& dimensions() const { return m_dimensions; }
-  FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
-  : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}
-};
-
-
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
-: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
-/// template deduction function for FunctorExtractor
-template <typename Evaluator>
-auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
-  return FunctorExtractor<Evaluator>(evaluator);
-}
-}  // namespace internal
-}  // namespace TensorSycl
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
deleted file mode 100644
index 25d1fac9bfc5d3ef2aaf6707c181c856c0140402..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ /dev/null
@@ -1,114 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclLeafCount.h
- *
- * \brief:
- *  The leaf count used the pre-order expression tree traverse in order to name
- *  count the number of leaf nodes in the expression
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// \brief LeafCount used to counting terminal nodes. The total number of
-/// leaf nodes is used by MakePlaceHolderExprHelper to find the order
-/// of the leaf node in a expression tree at compile time.
-template <typename Expr>
-struct LeafCount;
-
-template<typename... Args> struct CategoryCount;
-
-template<> struct CategoryCount<>
-{
-  static const size_t Count =0;
-};
-
-template<typename Arg, typename... Args>
-struct CategoryCount<Arg,Args...>{
-  static const size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
-struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > {
-  static const size_t Count =1;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorMap
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
-struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{};
-
-// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and Const TensorBroadcastingOp
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
-struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
-// TensorCwiseUnaryOp,  TensorCwiseNullaryOp,  TensorCwiseBinaryOp,  TensorCwiseTernaryOp, and  TensorBroadcastingOp
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
-struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp is an exception
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>
-struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
-/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>
-struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {};
-
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp
-template <typename LHSExpr, typename RHSExpr>
-struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
-
-/// specialisation of the \ref LeafCount struct when the node type is
-/// TensorAssignOp is an exception. It is not the same as Unary
-template <typename LHSExpr, typename RHSExpr>
-struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
-template <typename Expr>
-struct LeafCount<const TensorForcedEvalOp<Expr> > {
-    static const size_t Count =1;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp
-template <typename Expr>
-struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp
-template <typename Expr>
-struct LeafCount<const TensorEvalToOp<Expr> > {
-  static const size_t Count = 1 + CategoryCount<Expr>::Count;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-template <typename OP, typename Dim, typename Expr>
-struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > {
-    static const size_t Count =1;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp
-template <typename OP, typename Dim, typename Expr>
-struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{};
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
-template <typename Expr>
-struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{};
-
-} /// namespace TensorSycl
-} /// namespace internal
-} /// namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
deleted file mode 100644
index d4c250c6dd35be5b444c0ff8b1308cb21bc1c729..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ /dev/null
@@ -1,181 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclPlaceHolderExpr.h
- *
- * \brief:
- *  This is the specialisation of the placeholder expression based on the
- * operation type
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-/// \struct PlaceHolder
-/// \brief PlaceHolder is used to replace the \ref TensorMap in the expression
-/// tree.
-/// PlaceHolder contains the order of the leaf node in the expression tree.
-template <typename Scalar, size_t N>
-struct PlaceHolder {
-  static constexpr size_t I = N;
-  typedef Scalar Type;
-};
-
-/// \sttruct PlaceHolderExpression
-/// \brief it is used to create the PlaceHolder expression. The PlaceHolder
-/// expression is a copy of expression type in which the TensorMap of the has
-/// been replaced with PlaceHolder.
-template <typename Expr, size_t N>
-struct PlaceHolderExpression;
-
-template<size_t N, typename... Args>
-struct CalculateIndex;
-
-template<size_t N, typename Arg>
-struct CalculateIndex<N, Arg>{
-  typedef typename PlaceHolderExpression<Arg, N>::Type ArgType;
-  typedef utility::tuple::Tuple<ArgType> ArgsTuple;
-};
-
-template<size_t N, typename Arg1, typename Arg2>
-struct CalculateIndex<N, Arg1, Arg2>{
-  static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
-  typedef typename PlaceHolderExpression<Arg1, N - Arg2LeafCount>::Type Arg1Type;
-  typedef typename PlaceHolderExpression<Arg2, N>::Type Arg2Type;
-  typedef utility::tuple::Tuple<Arg1Type, Arg2Type> ArgsTuple;
-};
-
-template<size_t N, typename Arg1, typename Arg2, typename Arg3>
-struct CalculateIndex<N, Arg1, Arg2, Arg3> {
-  static const size_t Arg3LeafCount = LeafCount<Arg3>::Count;
-  static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
-  typedef typename PlaceHolderExpression<Arg1, N - Arg3LeafCount - Arg2LeafCount>::Type Arg1Type;
-  typedef typename PlaceHolderExpression<Arg2, N - Arg3LeafCount>::Type Arg2Type;
-  typedef typename PlaceHolderExpression<Arg3, N>::Type Arg3Type;
-  typedef utility::tuple::Tuple<Arg1Type, Arg2Type, Arg3Type> ArgsTuple;
-};
-
-template<template<class...> class Category , class OP, class TPL>
-struct CategoryHelper;
-
-template<template<class...> class Category , class OP, class ...T >
-struct CategoryHelper<Category, OP, utility::tuple::Tuple<T...> > {
-  typedef Category<OP, T... > Type;
-};
-
-template<template<class...> class Category , class ...T >
-struct CategoryHelper<Category, NoOP, utility::tuple::Tuple<T...> > {
-  typedef Category<T... > Type;
-};
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorBroadcastingOp, TensorCwiseBinaryOp,  TensorCwiseTernaryOp
-#define OPEXPRCATEGORY(CVQual)\
-template <template <class, class... > class Category, typename OP, typename... SubExpr, size_t N>\
-struct PlaceHolderExpression<CVQual Category<OP, SubExpr...>, N>{\
-  typedef CVQual typename CategoryHelper<Category, OP, typename CalculateIndex<N, SubExpr...>::ArgsTuple>::Type Type;\
-};
-
-OPEXPRCATEGORY(const)
-OPEXPRCATEGORY()
-#undef OPEXPRCATEGORY
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCwiseSelectOp
-#define SELECTEXPR(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, N> {\
-  typedef CVQual typename CategoryHelper<TensorSelectOp, NoOP, typename CalculateIndex<N, IfExpr, ThenExpr, ElseExpr>::ArgsTuple>::Type Type;\
-};
-
-SELECTEXPR(const)
-SELECTEXPR()
-#undef SELECTEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorAssignOp
-#define ASSIGNEXPR(CVQual)\
-template <typename LHSExpr, typename RHSExpr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr>, N> {\
-  typedef CVQual typename CategoryHelper<TensorAssignOp, NoOP, typename CalculateIndex<N, LHSExpr, RHSExpr>::ArgsTuple>::Type Type;\
-};
-
-ASSIGNEXPR(const)
-ASSIGNEXPR()
-#undef ASSIGNEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorMap
-#define TENSORMAPEXPR(CVQual)\
-template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\
-struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\
-  typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\
-};
-
-TENSORMAPEXPR(const)
-TENSORMAPEXPR()
-#undef TENSORMAPEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorForcedEvalOp
-#define FORCEDEVAL(CVQual)\
-template <typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorForcedEvalOp<Expr>, N> {\
-  typedef CVQual PlaceHolder<CVQual TensorForcedEvalOp<Expr>, N> Type;\
-};
-
-FORCEDEVAL(const)
-FORCEDEVAL()
-#undef FORCEDEVAL
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorEvalToOp
-#define EVALTO(CVQual)\
-template <typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorEvalToOp<Expr>, N> {\
-  typedef CVQual TensorEvalToOp<typename CalculateIndex <N, Expr>::ArgType> Type;\
-};
-
-EVALTO(const)
-EVALTO()
-#undef EVALTO
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp
-#define SYCLREDUCTION(CVQual)\
-template <typename OP, typename Dims, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\
-  typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\
-};
-SYCLREDUCTION(const)
-SYCLREDUCTION()
-#undef SYCLREDUCTION
-
-/// template deduction for \ref PlaceHolderExpression struct
-template <typename Expr>
-struct createPlaceHolderExpression {
-  static const size_t TotalLeaves = LeafCount<Expr>::Count;
-  typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type;
-};
-
-}  // internal
-}  // TensorSycl
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
deleted file mode 100644
index 7914b6fad64d7e7c1feef35d0f4d60cf68a32a0d..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Cummins Chris PhD student at The University of Edinburgh.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclRun.h
- *
- * \brief:
- * Schedule_kernel invoke an specialised version of kernel struct. The
- * specialisation is based on the data dimension in sycl buffer
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-/// The run function in tensor sycl convert the expression tree to a buffer
-/// based expression tree;
-/// creates the expression tree for the device with accessor to buffers;
-/// construct the kernel and submit it to the sycl queue.
-template <typename Expr, typename Dev>
-void run(Expr &expr, Dev &dev) {
-  Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
-  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-  if (needs_assign) {
-    typedef  typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
-    auto functors = internal::extractFunctors(evaluator);
-
-    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
-    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
-
-      // create a tuple of accessors from Evaluator
-      auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
-      const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0];
-      size_t GRange=range;
-      if (tileSize>GRange) tileSize=GRange;
-      else if(GRange>tileSize){
-        size_t xMode = GRange % tileSize;
-        if (xMode != 0) GRange += (tileSize - xMode);
-      }
-      // run the kernel
-      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef  typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
-        auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-        auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-        if (itemID.get_global_linear_id() < range) {
-          device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id()));
-        }
-      });
-    });
-    dev.m_queue.throw_asynchronous();
-  }
-
-  evaluator.cleanup();
-}
-}  // namespace TensorSycl
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
deleted file mode 100644
index 83915f31a67f15e8dcb1ab96b4faed3204ec4755..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ /dev/null
@@ -1,237 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensroSyclTuple.h
- *
- * \brief:
- *  Minimal implementation of std::tuple that can be used inside a SYCL kernel.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-namespace utility {
-namespace tuple {
-/// \struct StaticIf
-/// \brief The StaticIf struct is used to statically choose the type based on the
-/// condition.
-template <bool, typename T = void> struct StaticIf;
-/// \brief specialisation of the \ref StaticIf when the condition is true
-template <typename T>
-struct StaticIf<true, T> {
-  typedef T type;
-};
-
-/// \struct Tuple
-/// \brief is a fixed-size collection of heterogeneous values
-/// \tparam Ts...	-	the types of the elements that the tuple stores.
-/// Empty list is supported.
-template <class... Ts>
-struct Tuple {};
-
-/// \brief specialisation of the \ref Tuple class when the tuple has at least
-/// one element.
-/// \tparam T : the type of the first element in the tuple.
-/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
-template <class T, class... Ts>
-struct Tuple<T, Ts...> {
-  Tuple(T t, Ts... ts) : head(t), tail(ts...) {}
-  T head;
-  Tuple<Ts...> tail;
-};
-
-///\ struct ElemTypeHolder
-/// \brief ElemTypeHolder class is used to specify the types of the
-/// elements inside the tuple
-/// \tparam size_t the number of elements inside the tuple
-/// \tparam class the tuple class
-template <size_t, class>
-struct ElemTypeHolder;
-
-/// \brief specialisation of the \ref ElemTypeHolder class when the number of
-/// elements inside the tuple is 1
-template <class T, class... Ts>
-struct ElemTypeHolder<0, Tuple<T, Ts...> > {
-  typedef T type;
-};
-
-/// \brief specialisation of the \ref ElemTypeHolder class when the number of
-/// elements inside the tuple is bigger than 1. It recursively calls itself to
-/// detect the type of each element in the tuple
-/// \tparam T : the type of the first element in the tuple.
-/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
-/// \tparam K is the Kth element in the tuple
-template <size_t k, class T, class... Ts>
-struct ElemTypeHolder<k, Tuple<T, Ts...> > {
-  typedef typename ElemTypeHolder<k - 1, Tuple<Ts...> >::type type;
-};
-
-/// get
-/// \brief Extracts the first element from the tuple.
-/// K=0 represents the first element of the tuple. The tuple cannot be empty.
-/// \tparam Ts... are the type of the elements in the tuple.
-/// \param t is the tuple whose contents to extract
-/// \return  typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type
-
-#define TERMINATE_CONDS_TUPLE_GET(CVQual) \
-template <size_t k, class... Ts> \
-typename StaticIf<k == 0, CVQual typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type \
-get(CVQual Tuple<Ts...> &t) { \
-  static_assert(sizeof...(Ts)!=0, "The requseted value is bigger than the size of the tuple"); \
-  return t.head; \
-}
-
-TERMINATE_CONDS_TUPLE_GET(const)
-TERMINATE_CONDS_TUPLE_GET()
-#undef TERMINATE_CONDS_TUPLE_GET
-/// get
-/// \brief Extracts the Kth element from the tuple.
-///\tparam K is an integer value in [0,sizeof...(Types)).
-/// \tparam T is the (sizeof...(Types) -(K+1)) element in the tuple
-/// \tparam Ts... are the type of the elements  in the tuple.
-/// \param t is the tuple whose contents to extract
-/// \return  typename ElemTypeHolder<K, Tuple<Ts...> >::type &>::type
-#define RECURSIVE_TUPLE_GET(CVQual) \
-template <size_t k, class T, class... Ts> \
-typename StaticIf<k != 0, CVQual typename ElemTypeHolder<k, Tuple<T, Ts...> >::type &>::type \
-get(CVQual Tuple<T, Ts...> &t) { \
-  return utility::tuple::get<k - 1>(t.tail); \
-}
-RECURSIVE_TUPLE_GET(const)
-RECURSIVE_TUPLE_GET()
-#undef RECURSIVE_TUPLE_GET
-
-/// make_tuple
-/// \brief Creates a tuple object, deducing the target type from the types of
-/// arguments.
-/// \tparam Args the type of the arguments to construct the tuple from
-/// \param args zero or more arguments to construct the tuple from
-/// \return Tuple<Args...>
-template <typename... Args>
-Tuple<Args...> make_tuple(Args... args) {
-  return Tuple<Args...>(args...);
-}
-
-/// size
-/// \brief Provides access to the number of elements in a tuple as a
-/// compile-time constant expression.
-/// \tparam Args the type of the arguments to construct the tuple from
-/// \return size_t
-template <typename... Args>
-static constexpr size_t size(Tuple<Args...> &) {
-  return sizeof...(Args);
-}
-
-/// \struct IndexList
-/// \brief Creates a list of index from the elements in the tuple
-/// \tparam Is... a list of index from [0 to sizeof...(tuple elements))
-template <size_t... Is>
-struct IndexList {};
-
-/// \struct RangeBuilder
-/// \brief Collects internal details for generating index ranges [MIN, MAX)
-/// Declare primary template for index range builder
-/// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
-/// \tparam Is... are the list of generated index so far
-template <size_t MIN, size_t N, size_t... Is>
-struct RangeBuilder;
-
-// FIXME Doxygen has problems with recursive inheritance
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-/// \brief base Step: Specialisation of the \ref RangeBuilder when the
-/// MIN==MAX. In this case the Is... is [0 to sizeof...(tuple elements))
-/// \tparam MIN is the starting index of the tuple
-/// \tparam Is is [0 to sizeof...(tuple elements))
-template <size_t MIN, size_t... Is>
-struct RangeBuilder<MIN, MIN, Is...> {
-  typedef IndexList<Is...> type;
-};
-
-/// Induction step: Specialisation of the RangeBuilder class when N!=MIN
-/// in this case we are recursively subtracting N by one and adding one
-/// index to Is... list until MIN==N
-/// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
-/// \tparam Is... are the list of generated index so far
-template <size_t MIN, size_t N, size_t... Is>
-struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};
-#endif // EIGEN_PARSED_BY_DOXYGEN
-
-/// \brief IndexRange that returns a [MIN, MAX) index range
-/// \tparam MIN is the starting index in the tuple
-/// \tparam MAX is the size of the tuple
-template <size_t MIN, size_t MAX>
-struct IndexRange: RangeBuilder<MIN, MAX>::type {};
-
-/// append_base
-/// \brief unpacking the elements of the input tuple t and creating a new tuple
-/// by adding element a at the end of it.
-///\tparam Args... the type of the elements inside the tuple t
-/// \tparam T the type of the new element going to be added at the end of tuple
-/// \tparam I... is the list of index from [0 to sizeof...(t))
-/// \param t the tuple on which we want to append a.
-/// \param a the new elements going to be added to the tuple
-/// \return Tuple<Args..., T>
-template <typename... Args, typename T, size_t... I>
-Tuple<Args..., T> append_base(Tuple<Args...> t, T a,IndexList<I...>) {
-  return utility::tuple::make_tuple(get<I>(t)..., a);
-}
-
-/// append
-/// \brief the deduction function for \ref append_base that automatically
-/// generate the \ref IndexRange
-///\tparam Args... the type of the elements inside the tuple t
-/// \tparam T the type of the new element going to be added at the end of tuple
-/// \param t the tuple on which we want to append a.
-/// \param a the new elements going to be added to the tuple
-/// \return Tuple<Args..., T>
-template <typename... Args, typename T>
-Tuple<Args..., T> append(Tuple<Args...> t, T a) {
-  return utility::tuple::append_base(t, a,  IndexRange<0, sizeof...(Args)>());
-}
-
-/// append_base
-/// \brief This is a specialisation of \ref append_base when we want to
-/// concatenate
-/// tuple t2 at the end of the tuple t1. Here we unpack both tuples, generate the
-/// IndexRange for each of them and create an output tuple T that contains both
-/// elements of t1 and t2.
-///\tparam Args1... the type of the elements inside the tuple t1
-///\tparam Args2... the type of the elements inside the tuple t2
-/// \tparam I1... is the list of index from [0 to sizeof...(t1))
-/// \tparam I2... is the list of index from [0 to sizeof...(t2))
-/// \param t1 is the tuple on which we want to append t2.
-/// \param t2 is the tuple that is going to be added on t1.
-/// \return Tuple<Args1..., Args2...>
-template <typename... Args1, typename... Args2, size_t... I1, size_t... I2>
-Tuple<Args1..., Args2...> append_base(Tuple<Args1...> t1, Tuple<Args2...> t2, IndexList<I1...>, IndexList<I2...>) {
-  return utility::tuple::make_tuple(get<I1>(t1)...,get<I2>(t2)...);
-}
-
-/// append
-/// \brief deduction function for \ref append_base when we are appending tuple
-/// t1 by tuple t2. In this case the \ref IndexRange for both tuple are
-/// automatically generated.
-///\tparam Args1... the type of the elements inside the tuple t1
-///\tparam Args2... the type of the elements inside the tuple t2
-/// \param t1 is the tuple on which we want to append t2.
-/// \param t2 is the tuple that is going to be added on t1.
-/// \return Tuple<Args1..., Args2...>
-template <typename... Args1, typename... Args2>
-Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
-  return utility::tuple::append_base(t1, t2, IndexRange<0, sizeof...(Args1)>(), IndexRange<0, sizeof...(Args2)>());
-}
-}  // tuple
-}  // utility
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
new file mode 100644
index 0000000000000000000000000000000000000000..926ecdd38f7c85fa498537af44f25d7ea4879c6d
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -0,0 +1,303 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+
+namespace Eigen {
+
+/** \class TensorTrace
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor Trace class.
+  *
+  *
+  */
+
+namespace internal {
+template<typename Dims, typename XprType>
+struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Dims, typename XprType>
+struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense>
+{
+  typedef const TensorTraceOp<Dims, XprType>& type;
+};
+
+template<typename Dims, typename XprType>
+struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type>
+{
+  typedef TensorTraceOp<Dims, XprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename Dims, typename XprType>
+class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> >
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
+      : m_xpr(expr), m_dims(dims) {
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Dims& dims() const { return m_dims; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Dims m_dims;
+};
+
+
+// Eval as rvalue
+template<typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
+{
+  typedef TensorTraceOp<Dims, ArgType> XprType;
+  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static const int NumReducedDims = internal::array_size<Dims>::value;
+  static const int NumOutputDims = NumInputDims - NumReducedDims;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumOutputDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : m_impl(op.expression(), device), m_traceDim(1), m_device(device)
+  {
+
+    EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    for (int i = 0; i < NumInputDims; ++i) {
+      m_reduced[i] = false;
+    }
+
+    const Dims& op_dims = op.dims();
+    for (int i = 0; i < NumReducedDims; ++i) {
+      eigen_assert(op_dims[i] >= 0);
+      eigen_assert(op_dims[i] < NumInputDims);
+      m_reduced[op_dims[i]] = true;
+    }
+
+    // All the dimensions should be distinct to compute the trace
+    int num_distinct_reduce_dims = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (m_reduced[i]) {
+        ++num_distinct_reduce_dims;
+      }
+    }
+
+    eigen_assert(num_distinct_reduce_dims == NumReducedDims);
+
+    // Compute the dimensions of the result.
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    int output_index = 0;
+    int reduced_index = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (m_reduced[i]) {
+        m_reducedDims[reduced_index] = input_dims[i];
+        if (reduced_index > 0) {
+          // All the trace dimensions must have the same size
+          eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
+        }
+        ++reduced_index;
+      }
+      else {
+        m_dimensions[output_index] = input_dims[i];
+        ++output_index;
+      }
+    }
+
+    if (NumReducedDims != 0) {
+      m_traceDim = m_reducedDims[0];
+    }
+
+    // Compute the output strides
+    if (NumOutputDims > 0) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        m_outputStrides[0] = 1;
+        for (int i = 1; i < NumOutputDims; ++i) {
+          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        }
+      }
+      else {
+        m_outputStrides.back() = 1;
+        for (int i = NumOutputDims - 2; i >= 0; --i) {
+          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        }
+      }
+    }
+
+    // Compute the input strides
+    if (NumInputDims > 0) {
+      array<Index, NumInputDims> input_strides;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        input_strides[0] = 1;
+        for (int i = 1; i < NumInputDims; ++i) {
+          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
+        }
+      }
+      else {
+        input_strides.back() = 1;
+        for (int i = NumInputDims - 2; i >= 0; --i) {
+          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+        }
+      }
+
+      output_index = 0;
+      reduced_index = 0;
+      for (int i = 0; i < NumInputDims; ++i) {
+        if(m_reduced[i]) {
+          m_reducedStrides[reduced_index] = input_strides[i];
+          ++reduced_index;
+        }
+        else {
+          m_preservedStrides[output_index] = input_strides[i];
+          ++output_index;
+        }
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_dimensions;
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    // Initialize the result
+    CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
+    Index index_stride = 0;
+    for (int i = 0; i < NumReducedDims; ++i) {
+      index_stride += m_reducedStrides[i];
+    }
+
+    // If trace is requested along all dimensions, starting index would be 0
+    Index cur_index = 0;
+    if (NumOutputDims != 0)
+      cur_index = firstInput(index);
+    for (Index i = 0; i < m_traceDim; ++i) {
+        result += m_impl.coeff(cur_index);
+        cur_index += index_stride;
+    }
+
+    return result;
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+        values[i] = coeff(index + i);
+    }
+    PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
+    return result;
+  }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
+ protected:
+  // Given the output index, finds the first index in the input tensor used to compute the trace
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumOutputDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      startInput += index * m_preservedStrides[0];
+    }
+    else {
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      startInput += index * m_preservedStrides[NumOutputDims - 1];
+    }
+    return startInput;
+  }
+
+  Dimensions m_dimensions;
+  TensorEvaluator<ArgType, Device> m_impl;
+  // Initialize the size of the trace dimension
+  Index m_traceDim;
+  const Device EIGEN_DEVICE_REF m_device;
+  array<bool, NumInputDims> m_reduced;
+  array<Index, NumReducedDims> m_reducedDims;
+  array<Index, NumOutputDims> m_outputStrides;
+  array<Index, NumReducedDims> m_reducedStrides;
+  array<Index, NumOutputDims> m_preservedStrides;
+};
+
+
+} // End namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index ffcf8b00f3c071b7bdf6ec8232664d4605d9624f..4f7fd340ee2492e6d0eb8525439b23a6d39583cd 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -59,6 +59,7 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
   template <typename T> struct MakePointer {
     typedef T* Type;
   };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 
@@ -77,6 +78,7 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
   template <typename T> struct MakePointer {
     typedef T* Type;
   };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 
@@ -99,6 +101,7 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
     typedef MakePointer_<T> MakePointerT;
     typedef typename MakePointerT::Type Type;
   };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 template<typename PlainObjectType>
@@ -115,55 +118,56 @@ struct traits<TensorRef<PlainObjectType> >
     Options = BaseTraits::Options,
     Flags = BaseTraits::Flags
   };
+  typedef typename BaseTraits::PointerType PointerType;
 };
 
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType>
 struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType>
 struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 // TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
@@ -175,50 +179,38 @@ template<typename T, int n=1, typename PlainObject = void> struct nested
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 
-template <typename PlainObjectType, int Options, template <class> class MakePointer>
-struct nested<TensorMap<PlainObjectType, Options, MakePointer> >
-{
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
-};
-
-template <typename PlainObjectType, int Options, template <class> class MakePointer>
-struct nested<const TensorMap<PlainObjectType, Options, MakePointer> >
-{
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
-};
-
 template <typename PlainObjectType>
 struct nested<TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 template <typename PlainObjectType>
 struct nested<const TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 }  // end namespace internal
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 3523e7c94519c1d2d91cd41b7f499749e62752d9..d23f2e4c81eabb3e3a68e0fb35c49ac5f03c2a68 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,6 +23,7 @@ struct static_val {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    EIGEN_UNUSED_VARIABLE(v);
     eigen_assert(v == n);
   }
 };
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 0ca2cac8451a2a1c525bdfb9c3ecacb661a3fcf4..0beb9ff09b832f16006d8a57f3079f3aedb91599 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -22,6 +22,7 @@ namespace Eigen {
   * dimensions.
   */
 namespace internal {
+
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
 struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
 {
@@ -33,6 +34,8 @@ struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+
 };
 
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -65,12 +68,12 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
                                                             DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                             DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                             PaddingType padding_type, Scalar padding_value)
-      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
-        m_padding_type(padding_type), m_padding_value(padding_value) {}
+                                                            : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                            m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+                                                            m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                            m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                            m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+                                                            m_padding_type(padding_type), m_padding_value(padding_value) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
                                                            DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
@@ -80,13 +83,13 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
                                                            DenseIndex padding_top, DenseIndex padding_bottom,
                                                            DenseIndex padding_left, DenseIndex padding_right,
                                                            Scalar padding_value)
-      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
-        m_padding_left(padding_left), m_padding_right(padding_right),
-        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+                                                           m_padding_left(padding_left), m_padding_right(padding_right),
+                                                           m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
 
     EIGEN_DEVICE_FUNC
     DenseIndex patch_planes() const { return m_patch_planes; }
@@ -173,19 +176,26 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
+ m_impl(op.expression(), device)
   {
     EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -248,12 +258,12 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
           m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
           m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
           m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
-          const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff;
-          const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff;
-          const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff;
-          m_planePaddingTop = dz - dz / 2;
-          m_rowPaddingTop = dy - dy / 2;
-          m_colPaddingLeft = dx - dx / 2;
+          const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff;
+          const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff;
+          const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff;
+          m_planePaddingTop = dz / 2;
+          m_rowPaddingTop = dy / 2;
+          m_colPaddingLeft = dx / 2;
           break;
         }
         default:
@@ -322,6 +332,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
     // Fast representations of different variables.
     m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+
     m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
     m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
     m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
@@ -341,12 +352,12 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -502,30 +513,38 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
     return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-  Index planePaddingTop() const { return m_planePaddingTop; }
-  Index rowPaddingTop() const { return m_rowPaddingTop; }
-  Index colPaddingLeft() const { return m_colPaddingLeft; }
-  Index outputPlanes() const { return m_outputPlanes; }
-  Index outputRows() const { return m_outputRows; }
-  Index outputCols() const { return m_outputCols; }
-  Index userPlaneStride() const { return m_plane_strides; }
-  Index userRowStride() const { return m_row_strides; }
-  Index userColStride() const { return m_col_strides; }
-  Index userInPlaneStride() const { return m_in_plane_strides; }
-  Index userInRowStride() const { return m_in_row_strides; }
-  Index userInColStride() const { return m_in_col_strides; }
-  Index planeInflateStride() const { return m_plane_inflate_strides; }
-  Index rowInflateStride() const { return m_row_inflate_strides; }
-  Index colInflateStride() const { return m_col_inflate_strides; }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -535,7 +554,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
   Dimensions m_dimensions;
 
-  // Parameters passed to the costructor.
+  // Parameters passed to the constructor.
   Index m_plane_strides;
   Index m_row_strides;
   Index m_col_strides;
@@ -600,6 +619,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   Scalar m_paddingValue;
 
   TensorEvaluator<ArgType, Device> m_impl;
+
+
 };
 
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/contrib/eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
index 5e97d07a9565eebbad9fc630a1a7dd4da9bd4867..54bf9dbb3bee8f8b148b72afb3cd51e1062fe374 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -244,7 +244,7 @@ struct dimino_first_step_elements
   * multiplying all elements in the given subgroup with the new
   * coset representative. Note that the first element of the
   * subgroup is always the identity element, so the first element of
-  * ther result of this template is going to be the coset
+  * the result of this template is going to be the coset
   * representative itself.
   *
   * Note that this template accepts an additional boolean parameter
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4c59dc3dff9dbf48f248668674abbc64d8338ec
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+
+#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
+#define EIGEN_CXX11_THREADPOOL_BARRIER_H
+
+namespace Eigen {
+
+class Barrier {
+ public:
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_plain_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
+
+  void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      // Clear the lowest bit (waiter flag) and check that the original state
+      // value was not zero. If it was zero, it means that notify was called
+      // more times than the original count.
+      eigen_plain_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
+    std::unique_lock<std::mutex> l(mu_);
+    eigen_plain_assert(!notified_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
+    std::unique_lock<std::mutex> l(mu_);
+    while (!notified_) {
+      cv_.wait(l);
+    }
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
+  bool notified_;
+};
+
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1){};
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_BARRIER_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 4749d6240af3f327528b20a6d39a36a04802abb4..4549aa0691722fa393c1d086aefc294bd4f6d33c 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -33,10 +33,10 @@ namespace Eigen {
 //   ec.Notify(true);
 //
 // Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
-// cheap, but they are executed only if the preceeding predicate check has
+// cheap, but they are executed only if the preceding predicate check has
 // failed.
 //
-// Algorihtm outline:
+// Algorithm outline:
 // There are two main variables: predicate (managed by user) and state_.
 // Operation closely resembles Dekker mutual algorithm:
 // https://en.wikipedia.org/wiki/Dekker%27s_algorithm
@@ -50,117 +50,114 @@ class EventCount {
  public:
   class Waiter;
 
-  EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
-    eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
-    // Initialize epoch to something close to overflow to test overflow.
-    state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
+  EventCount(MaxSizeVector<Waiter>& waiters)
+      : state_(kStackMask), waiters_(waiters) {
+    eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
   }
 
   ~EventCount() {
     // Ensure there are no waiters.
-    eigen_plain_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
+    eigen_plain_assert(state_.load() == kStackMask);
   }
 
   // Prewait prepares for waiting.
-  // After calling this function the thread must re-check the wait predicate
-  // and call either CancelWait or CommitWait passing the same Waiter object.
-  void Prewait(Waiter* w) {
-    w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed);
-    std::atomic_thread_fence(std::memory_order_seq_cst);
+  // After calling Prewait, the thread must re-check the wait predicate
+  // and then call either CancelWait or CommitWait.
+  void Prewait() {
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      CheckState(state);
+      uint64_t newstate = state + kWaiterInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_seq_cst))
+        return;
+    }
   }
 
-  // CommitWait commits waiting.
+  // CommitWait commits waiting after Prewait.
   void CommitWait(Waiter* w) {
+    eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
     w->state = Waiter::kNotSignaled;
-    // Modification epoch of this waiter.
-    uint64_t epoch =
-        (w->epoch & kEpochMask) +
-        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+    const uint64_t me = (w - &waiters_[0]) | w->epoch;
     uint64_t state = state_.load(std::memory_order_seq_cst);
     for (;;) {
-      if (int64_t((state & kEpochMask) - epoch) < 0) {
-        // The preceeding waiter has not decided on its fate. Wait until it
-        // calls either CancelWait or CommitWait, or is notified.
-        EIGEN_THREAD_YIELD();
-        state = state_.load(std::memory_order_seq_cst);
-        continue;
+      CheckState(state, true);
+      uint64_t newstate;
+      if ((state & kSignalMask) != 0) {
+        // Consume the signal and return immidiately.
+        newstate = state - kWaiterInc - kSignalInc;
+      } else {
+        // Remove this thread from pre-wait counter and add to the waiter stack.
+        newstate = ((state & kWaiterMask) - kWaiterInc) | me;
+        w->next.store(state & (kStackMask | kEpochMask),
+                      std::memory_order_relaxed);
       }
-      // We've already been notified.
-      if (int64_t((state & kEpochMask) - epoch) > 0) return;
-      // Remove this thread from prewait counter and add it to the waiter list.
-      eigen_assert((state & kWaiterMask) != 0);
-      uint64_t newstate = state - kWaiterInc + kEpochInc;
-      newstate = (newstate & ~kStackMask) | (w - &waiters_[0]);
-      if ((state & kStackMask) == kStackMask)
-        w->next.store(nullptr, std::memory_order_relaxed);
-      else
-        w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed);
+      CheckState(newstate);
       if (state_.compare_exchange_weak(state, newstate,
-                                       std::memory_order_release))
-        break;
+                                       std::memory_order_acq_rel)) {
+        if ((state & kSignalMask) == 0) {
+          w->epoch += kEpochInc;
+          Park(w);
+        }
+        return;
+      }
     }
-    Park(w);
   }
 
   // CancelWait cancels effects of the previous Prewait call.
-  void CancelWait(Waiter* w) {
-    uint64_t epoch =
-        (w->epoch & kEpochMask) +
-        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+  void CancelWait() {
     uint64_t state = state_.load(std::memory_order_relaxed);
     for (;;) {
-      if (int64_t((state & kEpochMask) - epoch) < 0) {
-        // The preceeding waiter has not decided on its fate. Wait until it
-        // calls either CancelWait or CommitWait, or is notified.
-        EIGEN_THREAD_YIELD();
-        state = state_.load(std::memory_order_relaxed);
-        continue;
-      }
-      // We've already been notified.
-      if (int64_t((state & kEpochMask) - epoch) > 0) return;
-      // Remove this thread from prewait counter.
-      eigen_assert((state & kWaiterMask) != 0);
-      if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
-                                       std::memory_order_relaxed))
+      CheckState(state, true);
+      uint64_t newstate = state - kWaiterInc;
+      // We don't know if the thread was also notified or not,
+      // so we should not consume a signal unconditionaly.
+      // Only if number of waiters is equal to number of signals,
+      // we know that the thread was notified and we must take away the signal.
+      if (((state & kWaiterMask) >> kWaiterShift) ==
+          ((state & kSignalMask) >> kSignalShift))
+        newstate -= kSignalInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acq_rel))
         return;
     }
   }
 
   // Notify wakes one or all waiting threads.
   // Must be called after changing the associated wait predicate.
-  void Notify(bool all) {
+  void Notify(bool notifyAll) {
     std::atomic_thread_fence(std::memory_order_seq_cst);
     uint64_t state = state_.load(std::memory_order_acquire);
     for (;;) {
+      CheckState(state);
+      const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      const uint64_t signals = (state & kSignalMask) >> kSignalShift;
       // Easy case: no waiters.
-      if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
-        return;
-      uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      if ((state & kStackMask) == kStackMask && waiters == signals) return;
       uint64_t newstate;
-      if (all) {
-        // Reset prewait counter and empty wait list.
-        newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
-      } else if (waiters) {
+      if (notifyAll) {
+        // Empty wait stack and set signal to number of pre-wait threads.
+        newstate =
+            (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
+      } else if (signals < waiters) {
         // There is a thread in pre-wait state, unblock it.
-        newstate = state + kEpochInc - kWaiterInc;
+        newstate = state + kSignalInc;
       } else {
         // Pop a waiter from list and unpark it.
         Waiter* w = &waiters_[state & kStackMask];
-        Waiter* wnext = w->next.load(std::memory_order_relaxed);
-        uint64_t next = kStackMask;
-        if (wnext != nullptr) next = wnext - &waiters_[0];
-        // Note: we don't add kEpochInc here. ABA problem on the lock-free stack
-        // can't happen because a waiter is re-pushed onto the stack only after
-        // it was in the pre-wait state which inevitably leads to epoch
-        // increment.
-        newstate = (state & kEpochMask) + next;
+        uint64_t next = w->next.load(std::memory_order_relaxed);
+        newstate = (state & (kWaiterMask | kSignalMask)) | next;
       }
+      CheckState(newstate);
       if (state_.compare_exchange_weak(state, newstate,
-                                       std::memory_order_acquire)) {
-        if (!all && waiters) return;  // unblocked pre-wait thread
+                                       std::memory_order_acq_rel)) {
+        if (!notifyAll && (signals < waiters))
+          return;  // unblocked pre-wait thread
         if ((state & kStackMask) == kStackMask) return;
         Waiter* w = &waiters_[state & kStackMask];
-        if (!all) w->next.store(nullptr, std::memory_order_relaxed);
+        if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
         Unpark(w);
         return;
       }
@@ -169,12 +166,13 @@ class EventCount {
 
   class Waiter {
     friend class EventCount;
-    // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector.
-    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
+    // Align to 128 byte boundary to prevent false sharing with other Waiter
+    // objects in the same vector.
+    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
     std::mutex mu;
     std::condition_variable cv;
-    uint64_t epoch;
-    unsigned state;
+    uint64_t epoch = 0;
+    unsigned state = kNotSignaled;
     enum {
       kNotSignaled,
       kWaiting,
@@ -184,23 +182,41 @@ class EventCount {
 
  private:
   // State_ layout:
-  // - low kStackBits is a stack of waiters committed wait.
+  // - low kWaiterBits is a stack of waiters committed wait
+  //   (indexes in waiters_ array are used as stack elements,
+  //   kStackMask means empty stack).
   // - next kWaiterBits is count of waiters in prewait state.
-  // - next kEpochBits is modification counter.
-  static const uint64_t kStackBits = 16;
-  static const uint64_t kStackMask = (1ull << kStackBits) - 1;
-  static const uint64_t kWaiterBits = 16;
-  static const uint64_t kWaiterShift = 16;
+  // - next kWaiterBits is count of pending signals.
+  // - remaining bits are ABA counter for the stack.
+  //   (stored in Waiter node and incremented on push).
+  static const uint64_t kWaiterBits = 14;
+  static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
+  static const uint64_t kWaiterShift = kWaiterBits;
   static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
                                       << kWaiterShift;
-  static const uint64_t kWaiterInc = 1ull << kWaiterBits;
-  static const uint64_t kEpochBits = 32;
-  static const uint64_t kEpochShift = 32;
+  static const uint64_t kWaiterInc = 1ull << kWaiterShift;
+  static const uint64_t kSignalShift = 2 * kWaiterBits;
+  static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
+                                      << kSignalShift;
+  static const uint64_t kSignalInc = 1ull << kSignalShift;
+  static const uint64_t kEpochShift = 3 * kWaiterBits;
+  static const uint64_t kEpochBits = 64 - kEpochShift;
   static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
   static const uint64_t kEpochInc = 1ull << kEpochShift;
   std::atomic<uint64_t> state_;
   MaxSizeVector<Waiter>& waiters_;
 
+  static void CheckState(uint64_t state, bool waiter = false) {
+    static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
+    const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+    const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+    eigen_plain_assert(waiters >= signals);
+    eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
+    eigen_plain_assert(!waiter || waiters > 0);
+    (void)waiters;
+    (void)signals;
+  }
+
   void Park(Waiter* w) {
     std::unique_lock<std::mutex> lock(w->mu);
     while (w->state != Waiter::kSignaled) {
@@ -209,10 +225,10 @@ class EventCount {
     }
   }
 
-  void Unpark(Waiter* waiters) {
-    Waiter* next = nullptr;
-    for (Waiter* w = waiters; w; w = next) {
-      next = w->next.load(std::memory_order_relaxed);
+  void Unpark(Waiter* w) {
+    for (Waiter* next; w; w = next) {
+      uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
+      next = wnext == kStackMask ? nullptr : &waiters_[wnext];
       unsigned state;
       {
         std::unique_lock<std::mutex> lock(w->mu);
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 354bce52aee3dce4d66a0bb607811aa789b275a3..23a2b5467cdc5b3c7ae1ec898d17700b6dbceaef 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -10,79 +10,116 @@
 #ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
 #define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
 
-
 namespace Eigen {
 
 template <typename Environment>
-class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
+class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
  public:
   typedef typename Environment::Task Task;
   typedef RunQueue<Task, 1024> Queue;
 
-  NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
+  ThreadPoolTempl(int num_threads, Environment env = Environment())
+      : ThreadPoolTempl(num_threads, true, env) {}
+
+  ThreadPoolTempl(int num_threads, bool allow_spinning,
+                  Environment env = Environment())
       : env_(env),
-        threads_(num_threads),
-        queues_(num_threads),
-        coprimes_(num_threads),
+        num_threads_(num_threads),
+        allow_spinning_(allow_spinning),
+        thread_data_(num_threads),
+        all_coprimes_(num_threads),
         waiters_(num_threads),
+        global_steal_partition_(EncodePartition(0, num_threads_)),
         blocked_(0),
         spinning_(0),
         done_(false),
+        cancelled_(false),
         ec_(waiters_) {
-    waiters_.resize(num_threads);
-
-    // Calculate coprimes of num_threads.
-    // Coprimes are used for a random walk over all threads in Steal
+    waiters_.resize(num_threads_);
+    // Calculate coprimes of all numbers [1, num_threads].
+    // Coprimes are used for random walks over all threads in Steal
     // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
-    // a walk starting thread index t and calculate num_threads - 1 subsequent
+    // a random starting thread index t and calculate num_threads - 1 subsequent
     // indices as (t + coprime) % num_threads, we will cover all threads without
     // repetitions (effectively getting a presudo-random permutation of thread
     // indices).
-    for (int i = 1; i <= num_threads; i++) {
-      unsigned a = i;
-      unsigned b = num_threads;
-      // If GCD(a, b) == 1, then a and b are coprimes.
-      while (b != 0) {
-        unsigned tmp = a;
-        a = b;
-        b = tmp % b;
-      }
-      if (a == 1) {
-        coprimes_.push_back(i);
-      }
-    }
-    for (int i = 0; i < num_threads; i++) {
-      queues_.push_back(new Queue());
+    eigen_plain_assert(num_threads_ < kMaxThreads);
+    for (int i = 1; i <= num_threads_; ++i) {
+      all_coprimes_.emplace_back(i);
+      ComputeCoprimes(i, &all_coprimes_.back());
     }
-    for (int i = 0; i < num_threads; i++) {
-      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+#ifndef EIGEN_THREAD_LOCAL
+    init_barrier_.reset(new Barrier(num_threads_));
+#endif
+    thread_data_.resize(num_threads_);
+    for (int i = 0; i < num_threads_; i++) {
+      SetStealPartition(i, EncodePartition(0, num_threads_));
+      thread_data_[i].thread.reset(
+          env_.CreateThread([this, i]() { WorkerLoop(i); }));
     }
+#ifndef EIGEN_THREAD_LOCAL
+    // Wait for workers to initialize per_thread_map_. Otherwise we might race
+    // with them in Schedule or CurrentThreadId.
+    init_barrier_->Wait();
+#endif
   }
 
-  ~NonBlockingThreadPoolTempl() {
+  ~ThreadPoolTempl() {
     done_ = true;
+
     // Now if all threads block without work, they will start exiting.
     // But note that threads can continue to work arbitrary long,
     // block, submit new work, unblock and otherwise live full life.
-    ec_.Notify(true);
+    if (!cancelled_) {
+      ec_.Notify(true);
+    } else {
+      // Since we were cancelled, there might be entries in the queues.
+      // Empty them to prevent their destructor from asserting.
+      for (size_t i = 0; i < thread_data_.size(); i++) {
+        thread_data_[i].queue.Flush();
+      }
+    }
+    // Join threads explicitly (by destroying) to avoid destruction order within
+    // this class.
+    for (size_t i = 0; i < thread_data_.size(); ++i)
+      thread_data_[i].thread.reset();
+  }
+
+  void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
+    eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
 
-    // Join threads explicitly to avoid destruction order issues.
-    for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
-    for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
+    // Pass this information to each thread queue.
+    for (int i = 0; i < num_threads_; i++) {
+      const auto& pair = partitions[i];
+      unsigned start = pair.first, end = pair.second;
+      AssertBounds(start, end);
+      unsigned val = EncodePartition(start, end);
+      SetStealPartition(i, val);
+    }
+  }
+
+  void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
+    ScheduleWithHint(std::move(fn), 0, num_threads_);
   }
 
-  void Schedule(std::function<void()> fn) {
+  void ScheduleWithHint(std::function<void()> fn, int start,
+                        int limit) override {
     Task t = env_.CreateTask(std::move(fn));
     PerThread* pt = GetPerThread();
     if (pt->pool == this) {
       // Worker thread of this pool, push onto the thread's queue.
-      Queue* q = queues_[pt->thread_id];
-      t = q->PushFront(std::move(t));
+      Queue& q = thread_data_[pt->thread_id].queue;
+      t = q.PushFront(std::move(t));
     } else {
       // A free-standing thread (or worker of another pool), push onto a random
       // queue.
-      Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
-      t = q->PushBack(std::move(t));
+      eigen_plain_assert(start < limit);
+      eigen_plain_assert(limit <= num_threads_);
+      int num_queues = limit - start;
+      int rnd = Rand(&pt->rand) % num_queues;
+      eigen_plain_assert(start + rnd < limit);
+      Queue& q = thread_data_[start + rnd].queue;
+      t = q.PushBack(std::move(t));
     }
     // Note: below we touch this after making w available to worker threads.
     // Strictly speaking, this can lead to a racy-use-after-free. Consider that
@@ -91,19 +128,32 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // completes overall computations, which in turn leads to destruction of
     // this. We expect that such scenario is prevented by program, that is,
     // this is kept alive while any threads can potentially be in Schedule.
-    if (!t.f)
+    if (!t.f) {
       ec_.Notify(false);
-    else
+    } else {
       env_.ExecuteTask(t);  // Push failed, execute directly.
+    }
   }
 
-  int NumThreads() const final {
-    return static_cast<int>(threads_.size());
+  void Cancel() EIGEN_OVERRIDE {
+    cancelled_ = true;
+    done_ = true;
+
+    // Let each thread know it's been cancelled.
+#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
+    for (size_t i = 0; i < thread_data_.size(); i++) {
+      thread_data_[i].thread->OnCancel();
+    }
+#endif
+
+    // Wake up the threads without work to let them exit on their own.
+    ec_.Notify(true);
   }
 
-  int CurrentThreadId() const final {
-    const PerThread* pt =
-        const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
+  int NumThreads() const EIGEN_FINAL { return num_threads_; }
+
+  int CurrentThreadId() const EIGEN_FINAL {
+    const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
     if (pt->pool == this) {
       return pt->thread_id;
     } else {
@@ -112,72 +162,191 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   }
 
  private:
+  // Create a single atomic<int> that encodes start and limit information for
+  // each thread.
+  // We expect num_threads_ < 65536, so we can store them in a single
+  // std::atomic<unsigned>.
+  // Exposed publicly as static functions so that external callers can reuse
+  // this encode/decode logic for maintaining their own thread-safe copies of
+  // scheduling and steal domain(s).
+  static const int kMaxPartitionBits = 16;
+  static const int kMaxThreads = 1 << kMaxPartitionBits;
+
+  inline unsigned EncodePartition(unsigned start, unsigned limit) {
+    return (start << kMaxPartitionBits) | limit;
+  }
+
+  inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
+    *limit = val & (kMaxThreads - 1);
+    val >>= kMaxPartitionBits;
+    *start = val;
+  }
+
+  void AssertBounds(int start, int end) {
+    eigen_plain_assert(start >= 0);
+    eigen_plain_assert(start < end);  // non-zero sized partition
+    eigen_plain_assert(end <= num_threads_);
+  }
+
+  inline void SetStealPartition(size_t i, unsigned val) {
+    thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
+  }
+
+  inline unsigned GetStealPartition(int i) {
+    return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
+  }
+
+  void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
+    for (int i = 1; i <= N; i++) {
+      unsigned a = i;
+      unsigned b = N;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes->push_back(i);
+      }
+    }
+  }
+
   typedef typename Environment::EnvThread Thread;
 
   struct PerThread {
-    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
-    NonBlockingThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
-    uint64_t rand;  // Random generator state.
-    int thread_id;  // Worker thread index in pool.
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
+    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    uint64_t rand;          // Random generator state.
+    int thread_id;          // Worker thread index in pool.
+#ifndef EIGEN_THREAD_LOCAL
+    // Prevent false sharing.
+    char pad_[128];
+#endif
+  };
+
+  struct ThreadData {
+    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
+    std::unique_ptr<Thread> thread;
+    std::atomic<unsigned> steal_partition;
+    Queue queue;
   };
 
   Environment env_;
-  MaxSizeVector<Thread*> threads_;
-  MaxSizeVector<Queue*> queues_;
-  MaxSizeVector<unsigned> coprimes_;
+  const int num_threads_;
+  const bool allow_spinning_;
+  MaxSizeVector<ThreadData> thread_data_;
+  MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
   MaxSizeVector<EventCount::Waiter> waiters_;
+  unsigned global_steal_partition_;
   std::atomic<unsigned> blocked_;
   std::atomic<bool> spinning_;
   std::atomic<bool> done_;
+  std::atomic<bool> cancelled_;
   EventCount ec_;
+#ifndef EIGEN_THREAD_LOCAL
+  std::unique_ptr<Barrier> init_barrier_;
+  std::mutex per_thread_map_mutex_;  // Protects per_thread_map_.
+  std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
+#endif
 
   // Main worker thread loop.
   void WorkerLoop(int thread_id) {
+#ifndef EIGEN_THREAD_LOCAL
+    std::unique_ptr<PerThread> new_pt(new PerThread());
+    per_thread_map_mutex_.lock();
+    bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
+    eigen_plain_assert(insertOK);
+    EIGEN_UNUSED_VARIABLE(insertOK);
+    per_thread_map_mutex_.unlock();
+    init_barrier_->Notify();
+    init_barrier_->Wait();
+#endif
     PerThread* pt = GetPerThread();
     pt->pool = this;
-    pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+    pt->rand = GlobalThreadIdHash();
     pt->thread_id = thread_id;
-    Queue* q = queues_[thread_id];
+    Queue& q = thread_data_[thread_id].queue;
     EventCount::Waiter* waiter = &waiters_[thread_id];
-    for (;;) {
-      Task t = q->PopFront();
-      if (!t.f) {
-        t = Steal();
+    // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
+    // proportional to num_threads_ and we assume that new work is scheduled at
+    // a constant rate, so we set spin_count to 5000 / num_threads_. The
+    // constant was picked based on a fair dice roll, tune it.
+    const int spin_count =
+        allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
+    if (num_threads_ == 1) {
+      // For num_threads_ == 1 there is no point in going through the expensive
+      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
+      // victim queues it might reverse the order in which ops are executed
+      // compared to the order in which they are scheduled, which tends to be
+      // counter-productive for the types of I/O workloads the single thread
+      // pools tend to be used for.
+      while (!cancelled_) {
+        Task t = q.PopFront();
+        for (int i = 0; i < spin_count && !t.f; i++) {
+          if (!cancelled_.load(std::memory_order_relaxed)) {
+            t = q.PopFront();
+          }
+        }
         if (!t.f) {
-          // Leave one thread spinning. This reduces latency.
-          // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
-          // Also, the time it takes to attempt to steal work 1000 times depends
-          // on the size of the thread pool. However the speed at which the user
-          // of the thread pool submit tasks is independent of the size of the
-          // pool. Consider a time based limit instead.
-          if (!spinning_ && !spinning_.exchange(true)) {
-            for (int i = 0; i < 1000 && !t.f; i++) {
-              t = Steal();
-            }
-            spinning_ = false;
+          if (!WaitForWork(waiter, &t)) {
+            return;
           }
+        }
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
+      }
+    } else {
+      while (!cancelled_) {
+        Task t = q.PopFront();
+        if (!t.f) {
+          t = LocalSteal();
           if (!t.f) {
-            if (!WaitForWork(waiter, &t)) {
-              return;
+            t = GlobalSteal();
+            if (!t.f) {
+              // Leave one thread spinning. This reduces latency.
+              if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+                for (int i = 0; i < spin_count && !t.f; i++) {
+                  if (!cancelled_.load(std::memory_order_relaxed)) {
+                    t = GlobalSteal();
+                  } else {
+                    return;
+                  }
+                }
+                spinning_ = false;
+              }
+              if (!t.f) {
+                if (!WaitForWork(waiter, &t)) {
+                  return;
+                }
+              }
             }
           }
         }
-      }
-      if (t.f) {
-        env_.ExecuteTask(t);
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
       }
     }
   }
 
-  // Steal tries to steal work from other worker threads in best-effort manner.
-  Task Steal() {
+  // Steal tries to steal work from other worker threads in the range [start,
+  // limit) in best-effort manner.
+  Task Steal(unsigned start, unsigned limit) {
     PerThread* pt = GetPerThread();
-    const size_t size = queues_.size();
+    const size_t size = limit - start;
     unsigned r = Rand(&pt->rand);
-    unsigned inc = coprimes_[r % coprimes_.size()];
-    unsigned victim = r % size;
+    // Reduce r into [0, size) range, this utilizes trick from
+    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+    eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
+    unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
+    unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
+    unsigned inc = all_coprimes_[size - 1][index];
+
     for (unsigned i = 0; i < size; i++) {
-      Task t = queues_[victim]->PopBack();
+      eigen_plain_assert(start + victim < limit);
+      Task t = thread_data_[start + victim].queue.PopBack();
       if (t.f) {
         return t;
       }
@@ -189,27 +358,52 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     return Task();
   }
 
+  // Steals work within threads belonging to the partition.
+  Task LocalSteal() {
+    PerThread* pt = GetPerThread();
+    unsigned partition = GetStealPartition(pt->thread_id);
+    // If thread steal partition is the same as global partition, there is no
+    // need to go through the steal loop twice.
+    if (global_steal_partition_ == partition) return Task();
+    unsigned start, limit;
+    DecodePartition(partition, &start, &limit);
+    AssertBounds(start, limit);
+
+    return Steal(start, limit);
+  }
+
+  // Steals work from any other thread in the pool.
+  Task GlobalSteal() {
+    return Steal(0, num_threads_);
+  }
+
+
   // WaitForWork blocks until new work is available (returns true), or if it is
   // time to exit (returns false). Can optionally return a task to execute in t
   // (in such case t.f != nullptr on return).
   bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
-    eigen_assert(!t->f);
+    eigen_plain_assert(!t->f);
     // We already did best-effort emptiness check in Steal, so prepare for
     // blocking.
-    ec_.Prewait(waiter);
+    ec_.Prewait();
     // Now do a reliable emptiness check.
     int victim = NonEmptyQueueIndex();
     if (victim != -1) {
-      ec_.CancelWait(waiter);
-      *t = queues_[victim]->PopBack();
-      return true;
+      ec_.CancelWait();
+      if (cancelled_) {
+        return false;
+      } else {
+        *t = thread_data_[victim].queue.PopBack();
+        return true;
+      }
     }
     // Number of blocked threads is used as termination condition.
     // If we are shutting down and all worker threads blocked without work,
     // that's we are done.
     blocked_++;
-    if (done_ && blocked_ == threads_.size()) {
-      ec_.CancelWait(waiter);
+    // TODO is blocked_ required to be unsigned?
+    if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
+      ec_.CancelWait();
       // Almost done, but need to re-check queues.
       // Consider that all queues are empty and all worker threads are preempted
       // right after incrementing blocked_ above. Now a free-standing thread
@@ -236,12 +430,15 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
 
   int NonEmptyQueueIndex() {
     PerThread* pt = GetPerThread();
-    const size_t size = queues_.size();
+    // We intentionally design NonEmptyQueueIndex to steal work from
+    // anywhere in the queue so threads don't block in WaitForWork() forever
+    // when all threads in their partition go to sleep. Steal is still local.
+    const size_t size = thread_data_.size();
     unsigned r = Rand(&pt->rand);
-    unsigned inc = coprimes_[r % coprimes_.size()];
+    unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
     unsigned victim = r % size;
     for (unsigned i = 0; i < size; i++) {
-      if (!queues_[victim]->Empty()) {
+      if (!thread_data_[victim].queue.Empty()) {
         return victim;
       }
       victim += inc;
@@ -252,10 +449,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     return -1;
   }
 
-  static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+  static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
+    return std::hash<std::thread::id>()(std::this_thread::get_id());
+  }
+
+  EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+#ifndef EIGEN_THREAD_LOCAL
+    static PerThread dummy;
+    auto it = per_thread_map_.find(GlobalThreadIdHash());
+    if (it == per_thread_map_.end()) {
+      return &dummy;
+    } else {
+      return it->second.get();
+    }
+#else
     EIGEN_THREAD_LOCAL PerThread per_thread_;
     PerThread* pt = &per_thread_;
     return pt;
+#endif
   }
 
   static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
@@ -263,11 +474,12 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // Update the internal state
     *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
     // Generate the random output (using the PCG-XSH-RS scheme)
-    return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+    return static_cast<unsigned>((current ^ (current >> 22)) >>
+                                 (22 + (current >> 61)));
   }
 };
 
-typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
+typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
 
 }  // namespace Eigen
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 6e505fc14c8a97997d4b59eb995dc4eb6c18af23..b572ebcdfa2547b10b436b4013ccb1e8274ceb3e 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -10,7 +10,6 @@
 #ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
 #define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
 
-
 namespace Eigen {
 
 // RunQueue is a fixed-size, partially non-blocking deque or Work items.
@@ -40,9 +39,9 @@ class RunQueue {
  public:
   RunQueue() : front_(0), back_(0) {
     // require power-of-two for fast masking
-    eigen_assert((kSize & (kSize - 1)) == 0);
-    eigen_assert(kSize > 2);            // why would you do this?
-    eigen_assert(kSize <= (64 << 10));  // leave enough space for counter
+    eigen_plain_assert((kSize & (kSize - 1)) == 0);
+    eigen_plain_assert(kSize > 2);            // why would you do this?
+    eigen_plain_assert(kSize <= (64 << 10));  // leave enough space for counter
     for (unsigned i = 0; i < kSize; i++)
       array_[i].state.store(kEmpty, std::memory_order_relaxed);
   }
@@ -98,11 +97,9 @@ class RunQueue {
   }
 
   // PopBack removes and returns the last elements in the queue.
-  // Can fail spuriously.
   Work PopBack() {
     if (Empty()) return Work();
-    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
-    if (!lock) return Work();
+    std::unique_lock<std::mutex> lock(mutex_);
     unsigned back = back_.load(std::memory_order_relaxed);
     Elem* e = &array_[back & kMask];
     uint8_t s = e->state.load(std::memory_order_relaxed);
@@ -116,11 +113,10 @@ class RunQueue {
   }
 
   // PopBackHalf removes and returns half last elements in the queue.
-  // Returns number of elements removed. But can also fail spuriously.
+  // Returns number of elements removed.
   unsigned PopBackHalf(std::vector<Work>* result) {
     if (Empty()) return 0;
-    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
-    if (!lock) return 0;
+    std::unique_lock<std::mutex> lock(mutex_);
     unsigned back = back_.load(std::memory_order_relaxed);
     unsigned size = Size();
     unsigned mid = back;
@@ -131,15 +127,14 @@ class RunQueue {
       Elem* e = &array_[mid & kMask];
       uint8_t s = e->state.load(std::memory_order_relaxed);
       if (n == 0) {
-        if (s != kReady ||
-            !e->state.compare_exchange_strong(s, kBusy,
-                                              std::memory_order_acquire))
+        if (s != kReady || !e->state.compare_exchange_strong(
+                               s, kBusy, std::memory_order_acquire))
           continue;
         start = mid;
       } else {
         // Note: no need to store temporal kBusy, we exclusively own these
         // elements.
-        eigen_assert(s == kReady);
+        eigen_plain_assert(s == kReady);
       }
       result->push_back(std::move(e->w));
       e->state.store(kEmpty, std::memory_order_release);
@@ -152,30 +147,18 @@ class RunQueue {
 
   // Size returns current queue size.
   // Can be called by any thread at any time.
-  unsigned Size() const {
-    // Emptiness plays critical role in thread pool blocking. So we go to great
-    // effort to not produce false positives (claim non-empty queue as empty).
-    for (;;) {
-      // Capture a consistent snapshot of front/tail.
-      unsigned front = front_.load(std::memory_order_acquire);
-      unsigned back = back_.load(std::memory_order_acquire);
-      unsigned front1 = front_.load(std::memory_order_relaxed);
-      if (front != front1) continue;
-      int size = (front & kMask2) - (back & kMask2);
-      // Fix overflow.
-      if (size < 0) size += 2 * kSize;
-      // Order of modification in push/pop is crafted to make the queue look
-      // larger than it is during concurrent modifications. E.g. pop can
-      // decrement size before the corresponding push has incremented it.
-      // So the computed size can be up to kSize + 1, fix it.
-      if (size > static_cast<int>(kSize)) size = kSize;
-      return size;
-    }
-  }
+  unsigned Size() const { return SizeOrNotEmpty<true>(); }
 
   // Empty tests whether container is empty.
   // Can be called by any thread at any time.
-  bool Empty() const { return Size() == 0; }
+  bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
+
+  // Delete all the elements from the queue.
+  void Flush() {
+    while (!Empty()) {
+      PopFront();
+    }
+  }
 
  private:
   static const unsigned kMask = kSize - 1;
@@ -191,7 +174,7 @@ class RunQueue {
   };
   std::mutex mutex_;
   // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
-  // front/back, repsectively. The remaining bits contain modification counters
+  // front/back, respectively. The remaining bits contain modification counters
   // that are incremented on Push operations. This allows us to (1) distinguish
   // between empty and full conditions (if we would use log(kSize) bits for
   // position, these conditions would be indistinguishable); (2) obtain
@@ -201,6 +184,49 @@ class RunQueue {
   std::atomic<unsigned> back_;
   Elem array_[kSize];
 
+  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
+  // only whether the size is 0 is guaranteed to be correct.
+  // Can be called by any thread at any time.
+  template<bool NeedSizeEstimate>
+  unsigned SizeOrNotEmpty() const {
+    // Emptiness plays critical role in thread pool blocking. So we go to great
+    // effort to not produce false positives (claim non-empty queue as empty).
+    unsigned front = front_.load(std::memory_order_acquire);
+    for (;;) {
+      // Capture a consistent snapshot of front/tail.
+      unsigned back = back_.load(std::memory_order_acquire);
+      unsigned front1 = front_.load(std::memory_order_relaxed);
+      if (front != front1) {
+        front = front1;
+        std::atomic_thread_fence(std::memory_order_acquire);
+        continue;
+      }
+      if (NeedSizeEstimate) {
+        return CalculateSize(front, back);
+      } else {
+        // This value will be 0 if the queue is empty, and undefined otherwise.
+        unsigned maybe_zero = ((front ^ back) & kMask2);
+        // Queue size estimate must agree with maybe zero check on the queue
+        // empty/non-empty state.
+        eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
+        return maybe_zero;
+      }
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE
+  unsigned CalculateSize(unsigned front, unsigned back) const {
+    int size = (front & kMask2) - (back & kMask2);
+    // Fix overflow.
+    if (size < 0) size += 2 * kSize;
+    // Order of modification in push/pop is crafted to make the queue look
+    // larger than it is during concurrent modifications. E.g. push can
+    // increment size before the corresponding pop has decremented it.
+    // So the computed size can be up to kSize + 1, fix it.
+    if (size > static_cast<int>(kSize)) size = kSize;
+    return static_cast<unsigned>(size);
+  }
+
   RunQueue(const RunQueue&) = delete;
   void operator=(const RunQueue&) = delete;
 };
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
deleted file mode 100644
index e75d0f467aaa25dd0c7138096b1f37d3917250e9..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ /dev/null
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
-#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
-
-namespace Eigen {
-
-// The implementation of the ThreadPool type ensures that the Schedule method
-// runs the functions it is provided in FIFO order when the scheduling is done
-// by a single thread.
-// Environment provides a way to create threads and also allows to intercept
-// task submission and execution.
-template <typename Environment>
-class SimpleThreadPoolTempl : public ThreadPoolInterface {
- public:
-  // Construct a pool that contains "num_threads" threads.
-  explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
-      : env_(env), threads_(num_threads), waiters_(num_threads) {
-    for (int i = 0; i < num_threads; i++) {
-      threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));
-    }
-  }
-
-  // Wait until all scheduled work has finished and then destroy the
-  // set of threads.
-  ~SimpleThreadPoolTempl() {
-    {
-      // Wait for all work to get done.
-      std::unique_lock<std::mutex> l(mu_);
-      while (!pending_.empty()) {
-        empty_.wait(l);
-      }
-      exiting_ = true;
-
-      // Wakeup all waiters.
-      for (auto w : waiters_) {
-        w->ready = true;
-        w->task.f = nullptr;
-        w->cv.notify_one();
-      }
-    }
-
-    // Wait for threads to finish.
-    for (auto t : threads_) {
-      delete t;
-    }
-  }
-
-  // Schedule fn() for execution in the pool of threads. The functions are
-  // executed in the order in which they are scheduled.
-  void Schedule(std::function<void()> fn) final {
-    Task t = env_.CreateTask(std::move(fn));
-    std::unique_lock<std::mutex> l(mu_);
-    if (waiters_.empty()) {
-      pending_.push_back(std::move(t));
-    } else {
-      Waiter* w = waiters_.back();
-      waiters_.pop_back();
-      w->ready = true;
-      w->task = std::move(t);
-      w->cv.notify_one();
-    }
-  }
-
-  int NumThreads() const final {
-    return static_cast<int>(threads_.size());
-  }
-
-  int CurrentThreadId() const final {
-    const PerThread* pt = this->GetPerThread();
-    if (pt->pool == this) {
-      return pt->thread_id;
-    } else {
-      return -1;
-    }
-  }
-
- protected:
-  void WorkerLoop(int thread_id) {
-    std::unique_lock<std::mutex> l(mu_);
-    PerThread* pt = GetPerThread();
-    pt->pool = this;
-    pt->thread_id = thread_id;
-    Waiter w;
-    Task t;
-    while (!exiting_) {
-      if (pending_.empty()) {
-        // Wait for work to be assigned to me
-        w.ready = false;
-        waiters_.push_back(&w);
-        while (!w.ready) {
-          w.cv.wait(l);
-        }
-        t = w.task;
-        w.task.f = nullptr;
-      } else {
-        // Pick up pending work
-        t = std::move(pending_.front());
-        pending_.pop_front();
-        if (pending_.empty()) {
-          empty_.notify_all();
-        }
-      }
-      if (t.f) {
-        mu_.unlock();
-        env_.ExecuteTask(t);
-        t.f = nullptr;
-        mu_.lock();
-      }
-    }
-  }
-
- private:
-  typedef typename Environment::Task Task;
-  typedef typename Environment::EnvThread Thread;
-
-  struct Waiter {
-    std::condition_variable cv;
-    Task task;
-    bool ready;
-  };
-
-  struct PerThread {
-    constexpr PerThread() : pool(NULL), thread_id(-1) { }
-    SimpleThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
-    int thread_id;                // Worker thread index in pool.
-  };
-
-  Environment env_;
-  std::mutex mu_;
-  MaxSizeVector<Thread*> threads_;  // All threads
-  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
-  std::deque<Task> pending_;        // Queue of pending work
-  std::condition_variable empty_;   // Signaled on pending_.empty()
-  bool exiting_ = false;
-
-  PerThread* GetPerThread() const {
-    EIGEN_THREAD_LOCAL PerThread per_thread;
-    return &per_thread;
-  }
-};
-
-typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
-
-}  // namespace Eigen
-
-#endif  // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a05685f11d80cfe7033b9ff620f684380ab80a42
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+
+// Try to come up with a portable way to cancel a thread
+#if EIGEN_OS_GNULINUX
+  #define EIGEN_THREAD_CANCEL(t)                  \
+    pthread_cancel(t.native_handle());
+  #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
+#else
+#define EIGEN_THREAD_CANCEL(t)
+#endif
+
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index 399f95cc158cc139a14b5421c05322c42b61db89..d94a06416aace537d9969387920419ba152fdbe9 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -23,6 +23,8 @@ struct StlThreadEnvironment {
    public:
     EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
     ~EnvThread() { thr_.join(); }
+    // This function is called when the threadpool is cancelled.
+    void OnCancel() { }
 
    private:
     std::thread thr_;
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
index cfa22173203aaa61e8bf24c87ee3c904e3ef6456..4e6847404780fbe1f67f3b3168eaa5f12a7e8e6b 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -10,13 +10,292 @@
 #ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
 #define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
 
-// Try to come up with a portable implementation of thread local variables
-#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
-#define EIGEN_THREAD_LOCAL static __thread
-#elif EIGEN_COMP_CLANG
-#define EIGEN_THREAD_LOCAL static __thread
+#ifdef EIGEN_AVOID_THREAD_LOCAL
+
+#ifdef EIGEN_THREAD_LOCAL
+#undef EIGEN_THREAD_LOCAL
+#endif
+
 #else
+
+#if EIGEN_MAX_CPP_VER >= 11 &&                         \
+    ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
+     __has_feature(cxx_thread_local)                || \
+     (EIGEN_COMP_MSVC >= 1900) )
 #define EIGEN_THREAD_LOCAL static thread_local
 #endif
 
+// Disable TLS for Apple and Android builds with older toolchains.
+#if defined(__APPLE__)
+// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
+// __IPHONE_8_0.
+#include <Availability.h>
+#include <TargetConditionals.h>
+#endif
+// Checks whether C++11's `thread_local` storage duration specifier is
+// supported.
+#if defined(__apple_build_version__) &&     \
+    ((__apple_build_version__ < 8000042) || \
+     (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
+// Notes: Xcode's clang did not support `thread_local` until version
+// 8, and even then not for all iOS < 9.0.
+#undef EIGEN_THREAD_LOCAL
+
+#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
+// There are platforms for which TLS should not be used even though the compiler
+// makes it seem like it's supported (Android NDK < r12b for example).
+// This is primarily because of linker problems and toolchain misconfiguration:
+// TLS isn't supported until NDK r12b per
+// https://developer.android.com/ndk/downloads/revision_history.html
+// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
+// <android/ndk-version.h>. For NDK < r16, users should define these macros,
+// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
+#if __has_include(<android/ndk-version.h>)
+#include <android/ndk-version.h>
+#endif  // __has_include(<android/ndk-version.h>)
+#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
+    defined(__NDK_MINOR__) &&                                               \
+    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
+#undef EIGEN_THREAD_LOCAL
+#endif
+#endif  // defined(__ANDROID__) && defined(__clang__)
+
+#endif  // EIGEN_AVOID_THREAD_LOCAL
+
+namespace Eigen {
+
+namespace internal {
+template <typename T>
+struct ThreadLocalNoOpInitialize {
+  void operator()(T&) const {}
+};
+
+template <typename T>
+struct ThreadLocalNoOpRelease {
+  void operator()(T&) const {}
+};
+
+}  // namespace internal
+
+// Thread local container for elements of type T, that does not use thread local
+// storage. As long as the number of unique threads accessing this storage
+// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
+// use a mutex for synchronization.
+//
+// Type `T` has to be default constructible, and by default each thread will get
+// a default constructed value. It is possible to specify custom `initialize`
+// callable, that will be called lazily from each thread accessing this object,
+// and will be passed a default initialized object of type `T`. Also it's
+// possible to pass a custom `release` callable, that will be invoked before
+// calling ~T().
+//
+// Example:
+//
+//   struct Counter {
+//     int value = 0;
+//   }
+//
+//   Eigen::ThreadLocal<Counter> counter(10);
+//
+//   // Each thread will have access to it's own counter object.
+//   Counter& cnt = counter.local();
+//   cnt++;
+//
+// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
+// std::this_thread::get_id() to identify threads. This value is not guaranteed
+// to be unique except for the life of the thread. A newly created thread may
+// get an OS-specific ID equal to that of an already destroyed thread.
+//
+// Somewhat similar to TBB thread local storage, with similar restrictions:
+// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
+//
+template <typename T,
+          typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
+          typename Release = internal::ThreadLocalNoOpRelease<T>>
+class ThreadLocal {
+  // We preallocate default constructed elements in MaxSizedVector.
+  static_assert(std::is_default_constructible<T>::value,
+                "ThreadLocal data type must be default constructible");
+
+ public:
+  explicit ThreadLocal(int capacity)
+      : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
+                    internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize)
+      : ThreadLocal(capacity, std::move(initialize),
+                    internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize, Release release)
+      : initialize_(std::move(initialize)),
+        release_(std::move(release)),
+        capacity_(capacity),
+        data_(capacity_),
+        ptr_(capacity_),
+        filled_records_(0) {
+    eigen_assert(capacity_ >= 0);
+    data_.resize(capacity_);
+    for (int i = 0; i < capacity_; ++i) {
+      ptr_.emplace_back(nullptr);
+    }
+  }
+
+  T& local() {
+    std::thread::id this_thread = std::this_thread::get_id();
+    if (capacity_ == 0) return SpilledLocal(this_thread);
+
+    std::size_t h = std::hash<std::thread::id>()(this_thread);
+    const int start_idx = h % capacity_;
+
+    // NOTE: From the definition of `std::this_thread::get_id()` it is
+    // guaranteed that we never can have concurrent insertions with the same key
+    // to our hash-map like data structure. If we didn't find an element during
+    // the initial traversal, it's guaranteed that no one else could have
+    // inserted it while we are in this function. This allows to massively
+    // simplify out lock-free insert-only hash map.
+
+    // Check if we already have an element for `this_thread`.
+    int idx = start_idx;
+    while (ptr_[idx].load() != nullptr) {
+      ThreadIdAndValue& record = *(ptr_[idx].load());
+      if (record.thread_id == this_thread) return record.value;
+
+      idx += 1;
+      if (idx >= capacity_) idx -= capacity_;
+      if (idx == start_idx) break;
+    }
+
+    // If we are here, it means that we found an insertion point in lookup
+    // table at `idx`, or we did a full traversal and table is full.
+
+    // If lock-free storage is full, fallback on mutex.
+    if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
+
+    // We double check that we still have space to insert an element into a lock
+    // free storage. If old value in `filled_records_` is larger than the
+    // records capacity, it means that some other thread added an element while
+    // we were traversing lookup table.
+    int insertion_index =
+        filled_records_.fetch_add(1, std::memory_order_relaxed);
+    if (insertion_index >= capacity_) return SpilledLocal(this_thread);
+
+    // At this point it's guaranteed that we can access to
+    // data_[insertion_index_] without a data race.
+    data_[insertion_index].thread_id = this_thread;
+    initialize_(data_[insertion_index].value);
+
+    // That's the pointer we'll put into the lookup table.
+    ThreadIdAndValue* inserted = &data_[insertion_index];
+
+    // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
+    ThreadIdAndValue* empty = nullptr;
+
+    // Now we have to find an insertion point into the lookup table. We start
+    // from the `idx` that was identified as an insertion point above, it's
+    // guaranteed that we will have an empty record somewhere in a lookup table
+    // (because we created a record in the `data_`).
+    const int insertion_idx = idx;
+
+    do {
+      // Always start search from the original insertion candidate.
+      idx = insertion_idx;
+      while (ptr_[idx].load() != nullptr) {
+        idx += 1;
+        if (idx >= capacity_) idx -= capacity_;
+        // If we did a full loop, it means that we don't have any free entries
+        // in the lookup table, and this means that something is terribly wrong.
+        eigen_assert(idx != insertion_idx);
+      }
+      // Atomic CAS of the pointer guarantees that any other thread, that will
+      // follow this pointer will see all the mutations in the `data_`.
+    } while (!ptr_[idx].compare_exchange_weak(empty, inserted));
+
+    return inserted->value;
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  void ForEach(std::function<void(std::thread::id, T&)> f) {
+    // Reading directly from `data_` is unsafe, because only CAS to the
+    // record in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      f(record->thread_id, record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      f(kv.first, kv.second);
+    }
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  ~ThreadLocal() {
+    // Reading directly from `data_` is unsafe, because only CAS to the record
+    // in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      release_(record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      release_(kv.second);
+    }
+  }
+
+ private:
+  struct ThreadIdAndValue {
+    std::thread::id thread_id;
+    T value;
+  };
+
+  // Use unordered map guarded by a mutex when lock free storage is full.
+  T& SpilledLocal(std::thread::id this_thread) {
+    std::unique_lock<std::mutex> lock(mu_);
+
+    auto it = per_thread_map_.find(this_thread);
+    if (it == per_thread_map_.end()) {
+      auto result = per_thread_map_.emplace(this_thread, T());
+      eigen_assert(result.second);
+      initialize_((*result.first).second);
+      return (*result.first).second;
+    } else {
+      return it->second;
+    }
+  }
+
+  Initialize initialize_;
+  Release release_;
+  const int capacity_;
+
+  // Storage that backs lock-free lookup table `ptr_`. Records stored in this
+  // storage contiguously starting from index 0.
+  MaxSizeVector<ThreadIdAndValue> data_;
+
+  // Atomic pointers to the data stored in `data_`. Used as a lookup table for
+  // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
+  MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
+
+  // Number of records stored in the `data_`.
+  std::atomic<int> filled_records_;
+
+  // We fallback on per thread map if lock-free storage is full. In practice
+  // this should never happen, if `capacity_` is a reasonable estimate of the
+  // number of threads running in a system.
+  std::mutex mu_;  // Protects per_thread_map_.
+  std::unordered_map<std::thread::id, T> per_thread_map_;
+};
+
+}  // namespace Eigen
+
 #endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index a65ee97c96a5359dcf7cd05441b9fe82f3f5c66a..25030dc0bf2f67c5ae9cdc47833ab444fcd847bc 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -16,8 +16,23 @@ namespace Eigen {
 // custom thread pools underneath.
 class ThreadPoolInterface {
  public:
+  // Submits a closure to be run by a thread in the pool.
   virtual void Schedule(std::function<void()> fn) = 0;
 
+  // Submits a closure to be run by threads in the range [start, end) in the
+  // pool.
+  virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
+                                int /*end*/) {
+    // Just defer to Schedule in case sub-classes aren't interested in
+    // overriding this functionality.
+    Schedule(fn);
+  }
+
+  // If implemented, stop processing the closures that have been enqueued.
+  // Currently running closures may still be processed.
+  // If not implemented, does nothing.
+  virtual void Cancel() {}
+
   // Returns the number of threads in the pool.
   virtual int NumThreads() const = 0;
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/contrib/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index ec27eddb8a88fc21ed68af0a5c310acd4401aef7..149ceaff01d99924b3fb6b410b524c66097e37ce 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -13,11 +13,6 @@
 #include <vector>
 #include "EmulateArray.h"
 
-// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
-// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
-// supports enough of the standard for our needs
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
-
 #include "CXX11Workarounds.h"
 
 namespace Eigen {
@@ -40,8 +35,9 @@ template<typename T, T... nn>
 struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
 
 template<typename T, T n, T... nn>
-struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; };
+struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 /* numeric list constructors
  *
  * equivalencies:
@@ -100,13 +96,14 @@ template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt..
 template<typename t, typename... tt>        struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
 template<int n>                             struct h_skip_helper_type<n>           { typedef type_list<> type; };
 template<>                                  struct h_skip_helper_type<0>           { typedef type_list<> type; };
+#endif //not EIGEN_PARSED_BY_DOXYGEN
 
 template<int n>
 struct h_skip {
   template<typename T, T... ii>
-  constexpr static inline typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
   template<typename... tt>
-  constexpr static inline typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
 };
 
 template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
@@ -123,6 +120,10 @@ template<typename a, typename... as>                      struct get<0, type_lis
 template<typename T, int n, T a, T... as>                        struct get<n, numeric_list<T, a, as...>>   : get<n-1, numeric_list<T, as...>> {};
 template<typename T, T a, T... as>                               struct get<0, numeric_list<T, a, as...>>   { constexpr static T value = a; };
 
+template<std::size_t n, typename T, T a, T... as> constexpr T       array_get(const numeric_list<T, a, as...>&) {
+   return get<(int)n, numeric_list<T, a, as...>>::value;
+}
+
 /* always get type, regardless of dummy; good for parameter pack expansion */
 
 template<typename T, T dummy, typename t> struct id_numeric  { typedef t type; };
@@ -264,7 +265,7 @@ template<
   typename Reducer
 > struct reduce<Reducer>
 {
-  constexpr static inline int run() { return Reducer::Identity; }
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
 };
 
 template<
@@ -272,7 +273,7 @@ template<
   typename A
 > struct reduce<Reducer, A>
 {
-  constexpr static inline A run(A a) { return a; }
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
 };
 
 template<
@@ -281,7 +282,7 @@ template<
   typename... Ts
 > struct reduce<Reducer, A, Ts...>
 {
-  constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
     return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
   }
 };
@@ -289,29 +290,29 @@ template<
 /* generic binary operations */
 
 struct sum_op           {
-  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
   static constexpr int Identity = 0;
 };
 struct product_op       {
-  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
   static constexpr int Identity = 1;
 };
 
-struct logical_and_op   { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
-struct logical_or_op    { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
+struct logical_and_op   { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
+struct logical_or_op    { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
 
-struct equal_op         { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
-struct not_equal_op     { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
-struct lesser_op        { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
-struct lesser_equal_op  { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
-struct greater_op       { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
-struct greater_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
+struct equal_op         { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
+struct not_equal_op     { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
+struct lesser_op        { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
+struct lesser_equal_op  { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
+struct greater_op       { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
+struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
 
 /* generic unary operations */
 
-struct not_op                { template<typename A> constexpr static inline auto run(A a) -> decltype(!a)      { return !a;      } };
-struct negation_op           { template<typename A> constexpr static inline auto run(A a) -> decltype(-a)      { return -a;      } };
-struct greater_equal_zero_op { template<typename A> constexpr static inline auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
+struct not_op                { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a)      { return !a;      } };
+struct negation_op           { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a)      { return -a;      } };
+struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
 
 
 /* reductions for lists */
@@ -320,13 +321,13 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
 // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
 // does...
 template<typename... Ts>
-constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
 {
   return reduce<product_op, Ts...>::run(ts...);
 }
 
 template<typename... Ts>
-constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
+constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
 {
   return reduce<sum_op, Ts...>::run(ts...);
 }
@@ -334,13 +335,13 @@ constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts
 /* reverse arrays */
 
 template<typename Array, int... n>
-constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
 {
   return {{array_get<sizeof...(n) - n - 1>(arr)...}};
 }
 
 template<typename T, std::size_t N>
-constexpr inline array<T, N> array_reverse(array<T, N> arr)
+constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
 {
   return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
 }
@@ -355,7 +356,7 @@ constexpr inline array<T, N> array_reverse(array<T, N> arr)
 // an infinite loop)
 template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
 struct h_array_reduce {
-  EIGEN_DEVICE_FUNC constexpr static inline auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
   {
     return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
   }
@@ -364,7 +365,7 @@ struct h_array_reduce {
 template<typename Reducer, typename T, std::size_t N>
 struct h_array_reduce<Reducer, T, N, 0>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, N>& arr, T)
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
   {
     return array_get<0>(arr);
   }
@@ -373,14 +374,14 @@ struct h_array_reduce<Reducer, T, N, 0>
 template<typename Reducer, typename T>
 struct h_array_reduce<Reducer, T, 0>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, 0>&, T identity)
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
   {
     return identity;
   }
 };
 
 template<typename Reducer, typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
 {
   return h_array_reduce<Reducer, T, N>::run(arr, identity);
 }
@@ -388,13 +389,13 @@ EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T i
 /* standard array reductions */
 
 template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
 {
   return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
 }
 
 template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
 {
   return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
 }
@@ -410,13 +411,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
 /* zip an array */
 
 template<typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
 {
   return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
 }
 
 template<typename Op, typename A, typename B, std::size_t N>
-constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
 {
   return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
 }
@@ -424,13 +425,13 @@ constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, a
 /* zip an array and reduce the result */
 
 template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr inline auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
+constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
 {
   return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
 }
 
 template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
+constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
 {
   return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
 }
@@ -438,13 +439,13 @@ constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decl
 /* apply stuff to an array */
 
 template<typename Op, typename A, std::size_t N, int... n>
-constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
 {
   return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
 }
 
 template<typename Op, typename A, std::size_t N>
-constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
 {
   return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -452,13 +453,13 @@ constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
 /* apply stuff to an array and reduce */
 
 template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
-constexpr inline auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
+constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
 {
   return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
 }
 
 template<typename Reducer, typename Op, typename A, std::size_t N>
-constexpr inline auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
+constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
 {
   return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -472,7 +473,7 @@ template<int n>
 struct h_repeat
 {
   template<typename t, int... ii>
-  constexpr static inline array<t, n> run(t v, numeric_list<int, ii...>)
+  constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
   {
     return {{ typename id_numeric<int, ii, t>::type(v)... }};
   }
@@ -533,10 +534,4 @@ InstType instantiate_by_c_array(ArrType* arr)
 
 } // end namespace Eigen
 
-#else // Non C++11, fallback to emulation mode
-
-#include "EmulateCXX11Meta.h"
-
-#endif
-
 #endif // EIGEN_CXX11META_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/contrib/eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
index fe4d22803ef981cd63ffdcf4cebbf1b19001c79b..056736c39fc469f6243ae51d8fee97b58bc1da70 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
@@ -32,7 +32,7 @@
  * On the other hand, visual studio still doesn't claim to support C++11 although it's
  * compliant enugh for our purpose.
  */
-#if (__cplusplus <= 199711L) && (EIGEN_COMP_MSVC < 1900)
+#if (EIGEN_COMP_CXXVER < 11)
 #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
 #pragma GCC diagnostic error "-Wfatal-errors"
 #endif
@@ -47,9 +47,9 @@ namespace internal {
  */
 
 
-template<std::size_t I, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I]; }
-template<std::size_t I, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return a[I]; }
-template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
+template<std::size_t I_, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I_]; }
+template<std::size_t I_, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return a[I_]; }
+template<std::size_t I_, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I_]; }
 
 /* Suppose you have a template of the form
  * template<typename T> struct X;
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/contrib/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 30d3ebcff3e17b363de00167ee57c04535d64e31..834b20b555fa34af5b586636e565e55d4ec39fff 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,15 +15,20 @@
 // The array class is only available starting with cxx11. Emulate our own here
 // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
 // Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
 
 namespace Eigen {
 template <typename T, size_t n> class array {
  public:
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
+  EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
+  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
 
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE T& front() { return values[0]; }
@@ -169,6 +174,7 @@ template <typename T> class array<T, 0> {
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    EIGEN_UNUSED_VARIABLE(l);
     eigen_assert(l.size() == 0);
   }
 #endif
@@ -191,30 +197,26 @@ EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs)
 
 
 namespace internal {
-template<std::size_t I, class T, std::size_t N>
+template<std::size_t I_, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
-  return a[I];
+  return a[I_];
 }
-template<std::size_t I, class T, std::size_t N>
+template<std::size_t I_, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
-  return a[I];
+  return a[I_];
 }
 
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N> > {
-  static const size_t value = N;
+  enum { value = N };
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N>& > {
-  static const size_t value = N;
+  enum { value = N };
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N> > {
-  static const size_t value = N;
+  enum { value = N };
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N>& > {
-  static const size_t value = N;
+  enum { value = N };
 };
 
 }  // end namespace internal
@@ -222,7 +224,7 @@ template<class T, std::size_t N> struct array_size<const array<T,N>& > {
 
 #else
 
-// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
 #include <array>
 namespace Eigen {
 
@@ -238,27 +240,19 @@ namespace internal {
  *                       this may not be constexpr
  */
 #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
-#define STD_GET_ARR_HACK             a._M_instance[I]
+#define STD_GET_ARR_HACK             a._M_instance[I_]
 #elif defined(_LIBCPP_VERSION)
-#define STD_GET_ARR_HACK             a.__elems_[I]
+#define STD_GET_ARR_HACK             a.__elems_[I_]
 #else
-#define STD_GET_ARR_HACK             std::template get<I, T, N>(a)
+#define STD_GET_ARR_HACK             std::template get<I_, T, N>(a)
 #endif
 
-template<std::size_t I, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
-template<std::size_t I, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
-template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
 
 #undef STD_GET_ARR_HACK
 
-template <typename T> struct array_size;
-template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
-  static const size_t value = N;
-};
-template <typename T> struct array_size;
-template<class T, std::size_t N> struct array_size<std::array<T,N> > {
-  static const size_t value = N;
-};
 }  // end namespace internal
 }  // end namespace Eigen
 
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h b/contrib/eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
deleted file mode 100644
index 8a536faf62f4f75dcb3c4d61b87f1608660d458d..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
+++ /dev/null
@@ -1,311 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_EMULATE_CXX11_META_H
-#define EIGEN_EMULATE_CXX11_META_H
-
-
-
-namespace Eigen {
-
-namespace internal {
-
-/** \internal
-  * \file CXX11/util/EmulateCXX11Meta.h
-  * This file emulates a subset of the functionality provided by CXXMeta.h for
-  * compilers that don't yet support cxx11 such as nvcc.
-  */
-
-struct empty_list { static const std::size_t count = 0; };
-
-template<typename T, typename Tail=empty_list> struct type_list {
-  typedef T HeadType;
-  typedef Tail TailType;
-  static const T head;
-  static const Tail tail;
-  static const std::size_t count = 1 + Tail::count;
-};
-
-struct null_type { };
-
-template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
-         typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
-         typename T7 = null_type, typename T8 = null_type>
-struct make_type_list {
-  typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
-
-  typedef type_list<T1, tailresult> type;
-};
-
-template<> struct make_type_list<> {
-  typedef empty_list type;
-};
-
-
-template <std::size_t index, class TList> struct get_type;
-
-template <class Head, class Tail>
-struct get_type<0, type_list<Head, Tail> >
-{
-  typedef Head type;
-};
-
-template <std::size_t i, class Head, class Tail>
-struct get_type<i, type_list<Head, Tail> >
-{
-  typedef typename get_type<i-1, Tail>::type type;
-};
-
-
-/* numeric list */
-template <typename T, T n>
-struct type2val {
-  typedef T type;
-  static const T value = n;
-};
-
-
-template<typename T, size_t n, T V> struct gen_numeric_list_repeated;
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> {
-  typedef typename make_type_list<type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V>, type2val<T, V> >::type type;
-};
-
-
-template <std::size_t index, class NList> struct get;
-
-template <std::size_t i>
-struct get<i, empty_list>
-{
-  get() { eigen_assert(false && "index overflow"); }
-  typedef void type;
-  static const char value = '\0';
-};
-
-template <std::size_t i, class Head>
-struct get<i, type_list<Head, empty_list> >
-{
-  get() { eigen_assert(false && "index overflow"); }
-  typedef void type;
-  static const char value = '\0';
-};
-
-template <class Head>
-struct get<0, type_list<Head, empty_list> >
-{
-  typedef typename Head::type type;
-  static const type value = Head::value;
-};
-
-template <class Head, class Tail>
-struct get<0, type_list<Head, Tail> >
-{
-  typedef typename Head::type type;
-  static const type value = Head::value;
-};
-
-template <std::size_t i, class Head, class Tail>
-struct get<i, type_list<Head, Tail> >
-{
-  typedef typename Tail::HeadType::type type;
-  static const type value = get<i-1, Tail>::value;
-};
-
-
-template <class NList> struct arg_prod {
-  static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value;
-};
-template <> struct arg_prod<empty_list> {
-  static const int value = 1;
-};
-
-
-template<int n, typename t>
-array<t, n> repeat(t v) {
-  array<t, n> array;
-  array.fill(v);
-  return array;
-}
-
-template<std::size_t I, class Head, class Tail>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) {
-  return get<I, type_list<Head, Tail> >::value;
-}
-template<std::size_t I, class Head, class Tail>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) {
-  return get<I, type_list<Head, Tail> >::value;
-}
-
-template <class NList>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) {
-  return arg_prod<NList>::value;
-}
-
-template<typename t, std::size_t n>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
-  t prod = 1;
-  for (size_t i = 0; i < n; ++i) { prod *= a[i]; }
-  return prod;
-}
-template<typename t>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
-  return 1;
-}
-
-template<typename t>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
-  eigen_assert(a.size() > 0);
-  t prod = 1;
-  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
-  return prod;
-}
-
-
-template<std::size_t I, class T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
-  return a[I];
-}
-template<std::size_t I, class T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
-  return a[I];
-}
-
-struct sum_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a + b; }
-};
-struct product_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a * b; }
-};
-
-struct logical_and_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a && b; }
-};
-struct logical_or_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a || b; }
-};
-
-struct equal_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a == b; }
-};
-struct not_equal_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a != b; }
-};
-struct lesser_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a < b; }
-};
-struct lesser_equal_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; }
-};
-
-struct greater_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a > b; }
-};
-struct greater_equal_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; }
-};
-
-struct not_op {
-  template<typename A> static inline bool run(A a) { return !a; }
-};
-struct negation_op {
-  template<typename A> static inline bool run(A a) { return -a; }
-};
-struct greater_equal_zero_op {
-  template<typename A> static inline bool run(A a) { return a >= 0; }
-};
-
-
-template<typename Reducer, typename Op, typename A, std::size_t N>
-struct ArrayApplyAndReduce {
-  static inline bool run(const array<A, N>& a) {
-    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    bool result = Reducer::run(Op::run(a[0]), Op::run(a[1]));
-    for (size_t i = 2; i < N; ++i) {
-      result = Reducer::run(result, Op::run(a[i]));
-    }
-    return result;
-  }
-};
-
-template<typename Reducer, typename Op, typename A>
-struct ArrayApplyAndReduce<Reducer, Op, A, 1>  {
-  static inline bool run(const array<A, 1>& a) {
-    return Op::run(a[0]);
-  }
-};
-
-template<typename Reducer, typename Op, typename A, std::size_t N>
-inline bool array_apply_and_reduce(const array<A, N>& a) {
-  return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a);
-}
-
-template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-struct ArrayZipAndReduce {
-  static inline bool run(const array<A, N>& a, const array<B, N>& b) {
-    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
-    for (size_t i = 2; i < N; ++i) {
-      result = Reducer::run(result, Op::run(a[i], b[i]));
-    }
-    return result;
-  }
-};
-
-template<typename Reducer, typename Op, typename A, typename B>
-struct ArrayZipAndReduce<Reducer, Op, A, B, 1> {
-  static inline bool run(const array<A, 1>& a, const array<B, 1>& b) {
-    return Op::run(a[0], b[0]);
-  }
-};
-
-template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) {
-  return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b);
-}
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-
-
-#endif  // EIGEN_EMULATE_CXX11_META_H
diff --git a/contrib/eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/contrib/eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 4bc3dd1ba3fcd99fc3780cb0d1d1de33bb6659c2..277ab149a1e5c39902cdc6e8caafa639293e5be1 100644
--- a/contrib/eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+++ b/contrib/eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -29,13 +29,13 @@ namespace Eigen {
   */
 template <typename T>
 class MaxSizeVector {
+  static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*));
  public:
   // Construct a new MaxSizeVector, reserve n elements.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit MaxSizeVector(size_t n)
       : reserve_(n), size_(0),
-        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
-    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
+        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
   }
 
   // Construct a new MaxSizeVector, reserve and resize to n.
@@ -43,36 +43,56 @@ class MaxSizeVector {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   MaxSizeVector(size_t n, const T& init)
       : reserve_(n), size_(n),
-        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
-    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
+        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+    size_t i = 0;
+    EIGEN_TRY
+    {
+      for(; i < size_; ++i) { new (&data_[i]) T(init); }
+    }
+    EIGEN_CATCH(...)
+    {
+      // Construction failed, destruct in reverse order:
+      for(; (i+1) > 0; --i) { data_[i-1].~T(); }
+      internal::handmade_aligned_free(data_);
+      EIGEN_THROW;
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   ~MaxSizeVector() {
-    for (size_t i = 0; i < size_; ++i) {
-      data_[i].~T();
+    for (size_t i = size_; i > 0; --i) {
+      data_[i-1].~T();
     }
-    internal::aligned_free(data_);
+    internal::handmade_aligned_free(data_);
   }
 
   void resize(size_t n) {
     eigen_assert(n <= reserve_);
-    for (size_t i = size_; i < n; ++i) {
-      new (&data_[i]) T;
+    for (; size_ < n; ++size_) {
+      new (&data_[size_]) T;
     }
-    for (size_t i = n; i < size_; ++i) {
-      data_[i].~T();
+    for (; size_ > n; --size_) {
+      data_[size_-1].~T();
     }
-    size_ = n;
+    eigen_assert(size_ == n);
   }
 
   // Append new elements (up to reserved size).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void push_back(const T& t) {
     eigen_assert(size_ < reserve_);
-    data_[size_++] = t;
+    new (&data_[size_++]) T(t);
   }
 
+  // For C++03 compatibility this only takes one argument
+  template<class X>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void emplace_back(const X& x) {
+    eigen_assert(size_ < reserve_);
+    new (&data_[size_++]) T(x);
+  }
+
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const T& operator[] (size_t i) const {
     eigen_assert(i < size_);
@@ -99,11 +119,8 @@ class MaxSizeVector {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void pop_back() {
-    // NOTE: This does not destroy the value at the end the way
-    // std::vector's version of pop_back() does.  That happens when
-    // the Vector is destroyed.
     eigen_assert(size_ > 0);
-    size_--;
+    data_[--size_].~T();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/contrib/eigen/unsupported/Eigen/EulerAngles b/contrib/eigen/unsupported/Eigen/EulerAngles
index 521fa3f762bf037513e908030982177af5e5c6dc..f8f1c5d0baae36ebae57f02abfc913aaaff27684 100644
--- a/contrib/eigen/unsupported/Eigen/EulerAngles
+++ b/contrib/eigen/unsupported/Eigen/EulerAngles
@@ -11,10 +11,10 @@
 #define EIGEN_EULERANGLES_MODULE_H
 
 
-#include "Eigen/Core"
-#include "Eigen/Geometry"
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
 
-#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 namespace Eigen {
 
@@ -38,6 +38,6 @@ namespace Eigen {
 #include "src/EulerAngles/EulerSystem.h"
 #include "src/EulerAngles/EulerAngles.h"
 
-#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_EULERANGLES_MODULE_H
diff --git a/contrib/eigen/unsupported/Eigen/FFT b/contrib/eigen/unsupported/Eigen/FFT
index d8cf3e64267755d83a325f45cb96bdcef8991307..c8c311a60bef74004ac3daad1d1c661e55c70ec3 100644
--- a/contrib/eigen/unsupported/Eigen/FFT
+++ b/contrib/eigen/unsupported/Eigen/FFT
@@ -13,7 +13,7 @@
 #include <complex>
 #include <vector>
 #include <map>
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 
 /**
@@ -68,6 +68,8 @@
   */
  
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #ifdef EIGEN_FFTW_DEFAULT
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #  include <fftw3.h>
@@ -129,8 +131,6 @@ protected:
   const T_SrcMat & m_src;
   T_FftIfc & m_ifc;
   Index m_nfft;
-private:
-  fft_fwd_proxy& operator=(const fft_fwd_proxy&);
 };
 
 template<typename T_SrcMat,typename T_FftIfc> 
@@ -149,8 +149,6 @@ protected:
   const T_SrcMat & m_src;
   T_FftIfc & m_ifc;
   Index m_nfft;
-private:
-  fft_inv_proxy& operator=(const fft_inv_proxy&);
 };
 
 
@@ -415,5 +413,7 @@ void fft_inv_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
 }
 
 }
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 #endif
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/unsupported/Eigen/IterativeSolvers b/contrib/eigen/unsupported/Eigen/IterativeSolvers
index 31e880bdc11101fcec5c3c600bb5f7fd3039731a..a3f58d6762c0dcb559643b3b3784dd2f81f39cd4 100644
--- a/contrib/eigen/unsupported/Eigen/IterativeSolvers
+++ b/contrib/eigen/unsupported/Eigen/IterativeSolvers
@@ -10,19 +10,28 @@
 #ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H
 #define EIGEN_ITERATIVE_SOLVERS_MODULE_H
 
-#include <Eigen/Sparse>
+#include "../../Eigen/Sparse"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/Householder"
+
 
 /**
-  * \defgroup IterativeSolvers_Module Iterative solvers module
+  * \defgroup IterativeLinearSolvers_Module Iterative solvers module
   * This module aims to provide various iterative linear and non linear solver algorithms.
   * It currently provides:
   *  - a constrained conjugate gradient
   *  - a Householder GMRES implementation
+  *  - an IDR(s) implementation
+  *  - a DGMRES implementation
+  *  - a MINRES implementation
+  *
   * \code
   * #include <unsupported/Eigen/IterativeSolvers>
   * \endcode
   */
-//@{
+
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 #ifndef EIGEN_MPL2_ONLY
 #include "src/IterativeSolvers/IterationController.h"
@@ -30,13 +39,13 @@
 #endif
 
 #include "src/IterativeSolvers/IncompleteLU.h"
-#include "../../Eigen/Jacobi"
-#include "../../Eigen/Householder"
 #include "src/IterativeSolvers/GMRES.h"
 #include "src/IterativeSolvers/DGMRES.h"
 //#include "src/IterativeSolvers/SSORPreconditioner.h"
 #include "src/IterativeSolvers/MINRES.h"
+#include "src/IterativeSolvers/IDRS.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-//@}
 
 #endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H
diff --git a/contrib/eigen/unsupported/Eigen/LevenbergMarquardt b/contrib/eigen/unsupported/Eigen/LevenbergMarquardt
index 0fe2680bab0a5064373abe9053cde11f30441ef7..109050501cca972f1581337a6338964e24eea256 100644
--- a/contrib/eigen/unsupported/Eigen/LevenbergMarquardt
+++ b/contrib/eigen/unsupported/Eigen/LevenbergMarquardt
@@ -12,12 +12,12 @@
 
 // #include <vector>
 
-#include <Eigen/Core>
-#include <Eigen/Jacobi>
-#include <Eigen/QR>
-#include <unsupported/Eigen/NumericalDiff> 
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
 
-#include <Eigen/SparseQR>
+#include "../../Eigen/SparseQR"
 
 /**
   * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module
@@ -29,7 +29,10 @@
   * 
   */
 
-#include "Eigen/SparseCore"
+#include "../../Eigen/SparseCore"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
 #include "src/LevenbergMarquardt/LMqrsolv.h"
@@ -41,5 +44,6 @@
 #include "src/LevenbergMarquardt/LevenbergMarquardt.h"
 #include "src/LevenbergMarquardt/LMonestep.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_LEVENBERGMARQUARDT_MODULE
diff --git a/contrib/eigen/unsupported/Eigen/MPRealSupport b/contrib/eigen/unsupported/Eigen/MPRealSupport
index 7f0b70c63995dfa2013434502d716580354b6468..c4ea4ec5f6d80f7c97670371e3b3cb64fee2106b 100644
--- a/contrib/eigen/unsupported/Eigen/MPRealSupport
+++ b/contrib/eigen/unsupported/Eigen/MPRealSupport
@@ -12,7 +12,7 @@
 #ifndef EIGEN_MPREALSUPPORT_MODULE_H
 #define EIGEN_MPREALSUPPORT_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 #include <mpreal.h>
 
 namespace Eigen {
@@ -90,6 +90,9 @@ int main()
 #ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
     static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec())  { return std::numeric_limits<Real>::digits10(Precision); }
     static inline int digits10 (const Real& x)                                      { return std::numeric_limits<Real>::digits10(x); }
+    
+    static inline int digits ()               { return std::numeric_limits<Real>::digits(); }
+    static inline int digits (const Real& x)  { return std::numeric_limits<Real>::digits(x); }
 #endif
 
     static inline Real dummy_precision()
@@ -159,6 +162,7 @@ int main()
       typedef ResScalar LhsPacket;
       typedef ResScalar RhsPacket;
       typedef ResScalar ResPacket;
+      typedef LhsPacket LhsPacket4Packing;
       
     };
 
diff --git a/contrib/eigen/unsupported/Eigen/MatrixFunctions b/contrib/eigen/unsupported/Eigen/MatrixFunctions
index 60dc0a69b3a169fc429abaa0b652a39df5f7138e..20c23d1c5f126499bfd1dfd1f7144a1664e7fbe2 100644
--- a/contrib/eigen/unsupported/Eigen/MatrixFunctions
+++ b/contrib/eigen/unsupported/Eigen/MatrixFunctions
@@ -14,9 +14,9 @@
 #include <cfloat>
 #include <list>
 
-#include <Eigen/Core>
-#include <Eigen/LU>
-#include <Eigen/Eigenvalues>
+#include "../../Eigen/Core"
+#include "../../Eigen/LU"
+#include "../../Eigen/Eigenvalues"
 
 /**
   * \defgroup MatrixFunctions_Module Matrix functions module
@@ -53,12 +53,16 @@
   *
   */
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #include "src/MatrixFunctions/MatrixExponential.h"
 #include "src/MatrixFunctions/MatrixFunction.h"
 #include "src/MatrixFunctions/MatrixSquareRoot.h"
 #include "src/MatrixFunctions/MatrixLogarithm.h"
 #include "src/MatrixFunctions/MatrixPower.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 
 /** 
 \page matrixbaseextra_page
diff --git a/contrib/eigen/unsupported/Eigen/MoreVectorization b/contrib/eigen/unsupported/Eigen/MoreVectorization
index 470e7243086388dbf39490e678f190bba190e2e6..7662b47801ec03b87f00bccfe18d9eaec8de33f8 100644
--- a/contrib/eigen/unsupported/Eigen/MoreVectorization
+++ b/contrib/eigen/unsupported/Eigen/MoreVectorization
@@ -9,7 +9,7 @@
 #ifndef EIGEN_MOREVECTORIZATION_MODULE_H
 #define EIGEN_MOREVECTORIZATION_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
diff --git a/contrib/eigen/unsupported/Eigen/NonLinearOptimization b/contrib/eigen/unsupported/Eigen/NonLinearOptimization
index 600ab4c12e298e55c64361e8cdb93ad395f38ac9..961f192b5178a3e1b26b280c8b31b827d7a9e9bc 100644
--- a/contrib/eigen/unsupported/Eigen/NonLinearOptimization
+++ b/contrib/eigen/unsupported/Eigen/NonLinearOptimization
@@ -12,10 +12,10 @@
 
 #include <vector>
 
-#include <Eigen/Core>
-#include <Eigen/Jacobi>
-#include <Eigen/QR>
-#include <unsupported/Eigen/NumericalDiff>
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
 
 /**
   * \defgroup NonLinearOptimization_Module Non linear optimization module
@@ -30,12 +30,12 @@
   * actually linear. But if this is so, you should probably better use other
   * methods more fitted to this special case.
   *
-  * One algorithm allows to find an extremum of such a system (Levenberg
-  * Marquardt algorithm) and the second one is used to find 
+  * One algorithm allows to find a least-squares solution of such a system
+  * (Levenberg-Marquardt algorithm) and the second one is used to find 
   * a zero for the system (Powell hybrid "dogleg" method).
   *
   * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
-  * Minpack is a very famous, old, robust and well-reknown package, written in 
+  * Minpack is a very famous, old, robust and well renowned package, written in
   * fortran. Those implementations have been carefully tuned, tested, and used
   * for several decades.
   *
@@ -58,35 +58,41 @@
   * There are two kinds of tests : those that come from examples bundled with cminpack.
   * They guaranty we get the same results as the original algorithms (value for 'x',
   * for the number of evaluations of the function, and for the number of evaluations
-  * of the jacobian if ever).
+  * of the Jacobian if ever).
   * 
   * Other tests were added by myself at the very beginning of the 
-  * process and check the results for levenberg-marquardt using the reference data 
+  * process and check the results for Levenberg-Marquardt using the reference data 
   * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've 
-  * carefully checked that the same results were obtained when modifiying the 
+  * carefully checked that the same results were obtained when modifying the
   * code. Please note that we do not always get the exact same decimals as they do,
   * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
   * which is 64 bits on most platforms (x86 and amd64, at least).
-  * I've performed those tests on several other implementations of levenberg-marquardt, and
+  * I've performed those tests on several other implementations of Levenberg-Marquardt, and
   * (c)minpack performs VERY well compared to those, both in accuracy and speed.
   * 
   * The documentation for running the tests is on the wiki
   * http://eigen.tuxfamily.org/index.php?title=Tests
   * 
-  * \section API API : overview of methods
+  * \section API API: overview of methods
   * 
-  * Both algorithms can use either the jacobian (provided by the user) or compute 
-  * an approximation by themselves (actually using Eigen \ref NumericalDiff_Module).
-  * The part of API referring to the latter use 'NumericalDiff' in the method names
-  * (exemple: LevenbergMarquardt.minimizeNumericalDiff() ) 
+  * Both algorithms needs a functor computing the Jacobian. It can be computed by
+  * hand, using auto-differentiation (see \ref AutoDiff_Module), or using numerical
+  * differences (see \ref NumericalDiff_Module). For instance:
+  *\code
+  * MyFunc func;
+  * NumericalDiff<MyFunc> func_with_num_diff(func);
+  * LevenbergMarquardt<NumericalDiff<MyFunc> > lm(func_with_num_diff);
+  * \endcode
+  * For HybridNonLinearSolver, the method solveNumericalDiff() does the above wrapping for
+  * you.
   * 
   * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and 
   * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original 
   * minpack package that you probably should NOT use until you are porting a code that
-  *  was previously using minpack. They just define a 'simple' API with default values 
+  * was previously using minpack. They just define a 'simple' API with default values 
   * for some parameters.
   * 
-  * All algorithms are provided using Two APIs :
+  * All algorithms are provided using two APIs :
   *     - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants : 
   * this way the caller have control over the steps
   *     - one where the user just calls a method (optimize() or solve()) which will 
@@ -94,7 +100,7 @@
   *  convenience.
   * 
   * As an example, the method LevenbergMarquardt::minimize() is 
-  * implemented as follow : 
+  * implemented as follow: 
   * \code
   * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x, const int mode)
   * {
diff --git a/contrib/eigen/unsupported/Eigen/NumericalDiff b/contrib/eigen/unsupported/Eigen/NumericalDiff
index 433334ca80b7f9de91d26e3fc093002d87bb70dc..0668f960f9b6daffbe9d445fd77feb2cb1b635a4 100644
--- a/contrib/eigen/unsupported/Eigen/NumericalDiff
+++ b/contrib/eigen/unsupported/Eigen/NumericalDiff
@@ -10,7 +10,7 @@
 #ifndef EIGEN_NUMERICALDIFF_MODULE
 #define EIGEN_NUMERICALDIFF_MODULE
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
diff --git a/contrib/eigen/unsupported/Eigen/OpenGLSupport b/contrib/eigen/unsupported/Eigen/OpenGLSupport
index 085325ce10d5f1db8fd9ee8ffc5a77ce925d0858..f8c213003b4364720c5cbb5b61904ad1f333be25 100644
--- a/contrib/eigen/unsupported/Eigen/OpenGLSupport
+++ b/contrib/eigen/unsupported/Eigen/OpenGLSupport
@@ -10,7 +10,7 @@
 #ifndef EIGEN_OPENGL_MODULE
 #define EIGEN_OPENGL_MODULE
 
-#include <Eigen/Geometry>
+#include "../../Eigen/Geometry"
 
 #if defined(__APPLE_CC__)
   #include <OpenGL/gl.h>
@@ -25,7 +25,7 @@ namespace Eigen {
   *
   * This module provides wrapper functions for a couple of OpenGL functions
   * which simplify the way to pass Eigen's object to openGL.
-  * Here is an exmaple:
+  * Here is an example:
   * 
   * \code
   * // You need to add path_to_eigen/unsupported to your include path.
diff --git a/contrib/eigen/unsupported/Eigen/Polynomials b/contrib/eigen/unsupported/Eigen/Polynomials
index 334b031427ad76b8e405e1fb0b5bce5ac87e1436..32ce2a2aa66f2461d97ae02fd86bed2c9399d00f 100644
--- a/contrib/eigen/unsupported/Eigen/Polynomials
+++ b/contrib/eigen/unsupported/Eigen/Polynomials
@@ -9,11 +9,11 @@
 #ifndef EIGEN_POLYNOMIALS_MODULE_H
 #define EIGEN_POLYNOMIALS_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
-#include <Eigen/Eigenvalues>
+#include "../../Eigen/Eigenvalues"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 // Note that EIGEN_HIDE_HEAVY_CODE has to be defined per module
 #if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS>=2)
@@ -132,7 +132,6 @@
   Output: \verbinclude PolynomialSolver1.out
 */
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_POLYNOMIALS_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/unsupported/Eigen/Skyline b/contrib/eigen/unsupported/Eigen/Skyline
index 71a68cb422ce349f76b7f214805f391ac01705fc..ebdf143f70a73935c2a4ae4af676d9b80ec0cf51 100644
--- a/contrib/eigen/unsupported/Eigen/Skyline
+++ b/contrib/eigen/unsupported/Eigen/Skyline
@@ -10,9 +10,9 @@
 #define EIGEN_SKYLINE_MODULE_H
 
 
-#include "Eigen/Core"
+#include "../../Eigen/Core"
 
-#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 #include <map>
 #include <cstdlib>
@@ -34,6 +34,6 @@
 #include "src/Skyline/SkylineInplaceLU.h"
 #include "src/Skyline/SkylineProduct.h"
 
-#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SKYLINE_MODULE_H
diff --git a/contrib/eigen/unsupported/Eigen/SparseExtra b/contrib/eigen/unsupported/Eigen/SparseExtra
index 819cffa27584cb24f25ec51e540d8de78e542936..ba5cbd66188d00b4eb0aa53e2b34f4c0c00034a0 100644
--- a/contrib/eigen/unsupported/Eigen/SparseExtra
+++ b/contrib/eigen/unsupported/Eigen/SparseExtra
@@ -24,6 +24,7 @@
 
 #ifdef EIGEN_GOOGLEHASH_SUPPORT
   #include <google/dense_hash_map>
+  #include <google/sparse_hash_map>
 #endif
 
 /**
diff --git a/contrib/eigen/unsupported/Eigen/SpecialFunctions b/contrib/eigen/unsupported/Eigen/SpecialFunctions
index a2ad4925e9c7c01f646dc38243ff09a9f370a7ee..f6a2460e6757b3d0d08a10c2068ac7ad11f2f5ed 100644
--- a/contrib/eigen/unsupported/Eigen/SpecialFunctions
+++ b/contrib/eigen/unsupported/Eigen/SpecialFunctions
@@ -29,12 +29,29 @@ namespace Eigen {
   * - erfc
   * - lgamma
   * - igamma
+  * - igamma_der_a
+  * - gamma_sample_der_alpha
   * - igammac
   * - digamma
+  * - ndtri
   * - polygamma
   * - zeta
   * - betainc
   *
+  * Bessel Functions
+  * - bessel_i0
+  * - bessel_i0e
+  * - bessel_i1
+  * - bessel_i1e
+  * - bessel_j0
+  * - bessel_j1
+  * - bessel_k0
+  * - bessel_k0e
+  * - bessel_k1
+  * - bessel_k1e
+  * - bessel_y0
+  * - bessel_y1
+  *
   * \code
   * #include <unsupported/Eigen/SpecialFunctions>
   * \endcode
@@ -43,14 +60,37 @@ namespace Eigen {
 
 }
 
+#include "src/SpecialFunctions/BesselFunctionsImpl.h"
+#include "src/SpecialFunctions/BesselFunctionsBFloat16.h"
+#include "src/SpecialFunctions/BesselFunctionsHalf.h"
+#include "src/SpecialFunctions/BesselFunctionsPacketMath.h"
+#include "src/SpecialFunctions/BesselFunctionsFunctors.h"
+#include "src/SpecialFunctions/BesselFunctionsArrayAPI.h"
 #include "src/SpecialFunctions/SpecialFunctionsImpl.h"
-#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#if defined(EIGEN_HIPCC)
+#include "src/SpecialFunctions/HipVectorCompatibility.h"
+#endif
+#include "src/SpecialFunctions/SpecialFunctionsBFloat16.h"
 #include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
 #include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
 #include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
 
-#if defined EIGEN_VECTORIZE_CUDA
-  #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
+#if defined EIGEN_VECTORIZE_AVX512
+  #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX512/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX512/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
+  #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/SpecialFunctions/arch/NEON/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/NEON/SpecialFunctions.h"
+#endif
+
+#if defined EIGEN_VECTORIZE_GPU
+  #include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h"
 #endif
 
 namespace Eigen {
diff --git a/contrib/eigen/unsupported/Eigen/Splines b/contrib/eigen/unsupported/Eigen/Splines
index 322e6b9f5c70577cb1eb52bc1cd7b389000c088f..2ca581364fff270a359f13f8eb63cfe1e5577d70 100644
--- a/contrib/eigen/unsupported/Eigen/Splines
+++ b/contrib/eigen/unsupported/Eigen/Splines
@@ -24,8 +24,12 @@ namespace Eigen
   */
 }
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #include "src/Splines/SplineFwd.h"
 #include "src/Splines/Spline.h"
 #include "src/Splines/SplineFitting.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 #endif // EIGEN_SPLINES_MODULE_H
diff --git a/contrib/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/contrib/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 58f3f3319d4788da6a8546dbecbf0ee12854c664..0f166e35f01f2ab4951404693366b191d0ebe71b 100755
--- a/contrib/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/contrib/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -26,11 +26,11 @@ void make_coherent(const A& a, const B&b)
   make_coherent_impl<A,B>::run(a.const_cast_derived(), b.const_cast_derived());
 }
 
-template<typename _DerType, bool Enable> struct auto_diff_special_op;
+template<typename DerivativeType, bool Enable> struct auto_diff_special_op;
 
 } // end namespace internal
 
-template<typename _DerType> class AutoDiffScalar;
+template<typename DerivativeType> class AutoDiffScalar;
 
 template<typename NewDerType>
 inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) {
@@ -38,16 +38,16 @@ inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::
 }
 
 /** \class AutoDiffScalar
-  * \brief A scalar type replacement with automatic differentation capability
+  * \brief A scalar type replacement with automatic differentiation capability
   *
-  * \param _DerType the vector type used to store/represent the derivatives. The base scalar type
+  * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type
   *                 as well as the number of derivatives to compute are determined from this type.
   *                 Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf
   *                 if the number of derivatives is not known at compile time, and/or, the number
   *                 of derivatives is large.
-  *                 Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a
+  *                 Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a
   *                 existing vector into an AutoDiffScalar.
-  *                 Finally, _DerType can also be any Eigen compatible expression.
+  *                 Finally, DerivativeType can also be any Eigen compatible expression.
   *
   * This class represents a scalar value while tracking its respective derivatives using Eigen's expression
   * template mechanism.
@@ -63,17 +63,17 @@ inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::
   *
   */
 
-template<typename _DerType>
+template<typename DerivativeType>
 class AutoDiffScalar
   : public internal::auto_diff_special_op
-            <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
-                                          typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
+            <DerivativeType, !internal::is_same<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar,
+                                          typename NumTraits<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar>::Real>::value>
 {
   public:
     typedef internal::auto_diff_special_op
-            <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
-                       typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> Base;
-    typedef typename internal::remove_all<_DerType>::type DerType;
+            <DerivativeType, !internal::is_same<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar,
+                       typename NumTraits<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar>::Real>::value> Base;
+    typedef typename internal::remove_all<DerivativeType>::type DerType;
     typedef typename internal::traits<DerType>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real Real;
 
@@ -382,16 +382,16 @@ class AutoDiffScalar
 
 namespace internal {
 
-template<typename _DerType>
-struct auto_diff_special_op<_DerType, true>
-//   : auto_diff_scalar_op<_DerType, typename NumTraits<Scalar>::Real,
+template<typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, true>
+//   : auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
 //                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value>
 {
-  typedef typename remove_all<_DerType>::type DerType;
+  typedef typename remove_all<DerivativeType>::type DerType;
   typedef typename traits<DerType>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real Real;
 
-//   typedef auto_diff_scalar_op<_DerType, typename NumTraits<Scalar>::Real,
+//   typedef auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
 //                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value> Base;
 
 //   using Base::operator+;
@@ -401,8 +401,8 @@ struct auto_diff_special_op<_DerType, true>
 //   using Base::operator*;
 //   using Base::operator*=;
 
-  const AutoDiffScalar<_DerType>& derived() const { return *static_cast<const AutoDiffScalar<_DerType>*>(this); }
-  AutoDiffScalar<_DerType>& derived() { return *static_cast<AutoDiffScalar<_DerType>*>(this); }
+  const AutoDiffScalar<DerivativeType>& derived() const { return *static_cast<const AutoDiffScalar<DerivativeType>*>(this); }
+  AutoDiffScalar<DerivativeType>& derived() { return *static_cast<AutoDiffScalar<DerivativeType>*>(this); }
 
 
   inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
@@ -410,12 +410,12 @@ struct auto_diff_special_op<_DerType, true>
     return AutoDiffScalar<DerType&>(derived().value() + other, derived().derivatives());
   }
 
-  friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<_DerType>& b)
+  friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<DerivativeType>& b)
   {
     return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
   }
 
-  inline AutoDiffScalar<_DerType>& operator+=(const Real& other)
+  inline AutoDiffScalar<DerivativeType>& operator+=(const Real& other)
   {
     derived().value() += other;
     return derived();
@@ -431,22 +431,22 @@ struct auto_diff_special_op<_DerType, true>
   }
 
   friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >
-  operator*(const Real& other, const AutoDiffScalar<_DerType>& a)
+  operator*(const Real& other, const AutoDiffScalar<DerivativeType>& a)
   {
     return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >(
       a.value() * other,
       a.derivatives() * other);
   }
 
-  inline AutoDiffScalar<_DerType>& operator*=(const Scalar& other)
+  inline AutoDiffScalar<DerivativeType>& operator*=(const Scalar& other)
   {
     *this = *this * other;
     return derived();
   }
 };
 
-template<typename _DerType>
-struct auto_diff_special_op<_DerType, false>
+template<typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, false>
 {
   void operator*() const;
   void operator-() const;
@@ -565,6 +565,11 @@ struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, Bi
     CODE; \
   }
 
+template<typename DerType>
+struct CleanedUpDerType {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> type;
+};
+
 template<typename DerType>
 inline const AutoDiffScalar<DerType>& conj(const AutoDiffScalar<DerType>& x)  { return x; }
 template<typename DerType>
@@ -572,31 +577,31 @@ inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x)  {
 template<typename DerType>
 inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&)    { return 0.; }
 template<typename DerType, typename T>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const T& y) {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
   return (x <= y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const T& y) {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
   return (x >= y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const T& x, const AutoDiffScalar<DerType>& y) {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+inline typename CleanedUpDerType<DerType>::type (min)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
   return (x < y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const T& x, const AutoDiffScalar<DerType>& y) {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+inline typename CleanedUpDerType<DerType>::type (max)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
   return (x > y ? ADS(x) : ADS(y));
 }
 template<typename DerType>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
   return (x.value() < y.value() ? x : y);
 }
 template<typename DerType>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
   return (x.value() >= y.value() ? x : y);
 }
 
@@ -711,10 +716,15 @@ template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
 }
 
 namespace std {
+
 template <typename T>
 class numeric_limits<Eigen::AutoDiffScalar<T> >
   : public numeric_limits<typename T::Scalar> {};
 
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T&> >
+  : public numeric_limits<typename T::Scalar> {};
+
 }  // namespace std
 
 #endif // EIGEN_AUTODIFF_SCALAR_H
diff --git a/contrib/eigen/unsupported/Eigen/src/BVH/KdBVH.h b/contrib/eigen/unsupported/Eigen/src/BVH/KdBVH.h
index 5e39af26c1fd2a211b3cec1e856c8c15085e75a1..2d5b76ad0ac04e9581d00896ef159248fd97b78f 100644
--- a/contrib/eigen/unsupported/Eigen/src/BVH/KdBVH.h
+++ b/contrib/eigen/unsupported/Eigen/src/BVH/KdBVH.h
@@ -171,7 +171,7 @@ private:
   typedef internal::vector_int_pair<Scalar, Dim> VIPair;
   typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
   typedef Matrix<Scalar, Dim, 1> VectorType;
-  struct VectorComparator //compares vectors, or, more specificall, VIPairs along a particular dimension
+  struct VectorComparator //compares vectors, or more specifically, VIPairs along a particular dimension
   {
     VectorComparator(int inDim) : dim(inDim) {}
     inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
diff --git a/contrib/eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/contrib/eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 4170d26b6f6bd4779586f70f2b16d2749f545726..0fbd8477294bca1d6b06a235882240da63cafc84 100644
--- a/contrib/eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/contrib/eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
 #define EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
 
-#include <Eigen/Dense>
+#include "../../../../Eigen/Dense"
 
 namespace Eigen { 
 
@@ -285,7 +285,7 @@ public:
 
   /** \brief Reports whether previous computation was successful.
    *
-   * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
    */
   ComputationInfo info() const
   {
diff --git a/contrib/eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/contrib/eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
index 40af550e8a92987c7846e53e22d30c180a368492..22088eb30ba4e7075a60e93b4adaa4cd78da0dc5 100644
--- a/contrib/eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
+++ b/contrib/eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -1,6 +1,6 @@
-FILE(GLOB Eigen_EulerAngles_SRCS "*.h")
+file(GLOB Eigen_EulerAngles_SRCS "*.h")
 
-INSTALL(FILES
+install(FILES
   ${Eigen_EulerAngles_SRCS}
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
   )
diff --git a/contrib/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/contrib/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
index 13a0da1ab44517908730c8069654052213eea0ec..e43cdb7fb2a56af1adb0ddfa3562af66394cd717 100644
--- a/contrib/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
+++ b/contrib/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -12,11 +12,6 @@
 
 namespace Eigen
 {
-  /*template<typename Other,
-         int OtherRows=Other::RowsAtCompileTime,
-         int OtherCols=Other::ColsAtCompileTime>
-  struct ei_eulerangles_assign_impl;*/
-
   /** \class EulerAngles
     *
     * \ingroup EulerAngles_Module
@@ -36,7 +31,7 @@ namespace Eigen
     * ### Rotation representation and conversions ###
     *
     * It has been proved(see Wikipedia link below) that every rotation can be represented
-    *  by Euler angles, but there is no singular representation (e.g. unlike rotation matrices).
+    *  by Euler angles, but there is no single representation (e.g. unlike rotation matrices).
     * Therefore, you can convert from Eigen rotation and to them
     *  (including rotation matrices, which is not called "rotations" by Eigen design).
     *
@@ -55,33 +50,27 @@ namespace Eigen
     * Additionally, some axes related computation is done in compile time.
     *
     * #### Euler angles ranges in conversions ####
+    * Rotations representation as EulerAngles are not single (unlike matrices),
+    *  and even have infinite EulerAngles representations.<BR>
+    * For example, add or subtract 2*PI from either angle of EulerAngles
+    *  and you'll get the same rotation.
+    * This is the general reason for infinite representation,
+    *  but it's not the only general reason for not having a single representation.
     *
-    * When converting some rotation to Euler angles, there are some ways you can guarantee
-    *  the Euler angles ranges.
+    * When converting rotation to EulerAngles, this class convert it to specific ranges
+    * When converting some rotation to EulerAngles, the rules for ranges are as follow:
+    * - If the rotation we converting from is an EulerAngles
+    *  (even when it represented as RotationBase explicitly), angles ranges are __undefined__.
+    * - otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+    *   As for Beta angle:
+    *    - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+    *    - otherwise:
+    *      - If the beta axis is positive, the beta angle will be in the range [0, PI]
+    *      - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
     *
-    * #### implicit ranges ####
-    * When using implicit ranges, all angles are guarantee to be in the range [-PI, +PI],
-    *  unless you convert from some other Euler angles.
-    * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI).
     * \sa EulerAngles(const MatrixBase<Derived>&)
     * \sa EulerAngles(const RotationBase<Derived, 3>&)
     *
-    * #### explicit ranges ####
-    * When using explicit ranges, all angles are guarantee to be in the range you choose.
-    * In the range Boolean parameter, you're been ask whether you prefer the positive range or not:
-    * - _true_ - force the range between [0, +2*PI]
-    * - _false_ - force the range between [-PI, +PI]
-    *
-    * ##### compile time ranges #####
-    * This is when you have compile time ranges and you prefer to
-    *  use template parameter. (e.g. for performance)
-    * \sa FromRotation()
-    *
-    * ##### run-time time ranges #####
-    * Run-time ranges are also supported.
-    * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool)
-    * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool)
-    *
     * ### Convenient user typedefs ###
     *
     * Convenient typedefs for EulerAngles exist for float and double scalar,
@@ -103,7 +92,7 @@ namespace Eigen
     *
     * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
     *
-    * \tparam _Scalar the scalar type, i.e., the type of the angles.
+    * \tparam _Scalar the scalar type, i.e. the type of the angles.
     *
     * \tparam _System the EulerSystem to use, which represents the axes of rotation.
     */
@@ -111,8 +100,11 @@ namespace Eigen
   class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
   {
     public:
+      typedef RotationBase<EulerAngles<_Scalar, _System>, 3> Base;
+      
       /** the scalar type of the angles */
       typedef _Scalar Scalar;
+      typedef typename NumTraits<Scalar>::Real RealScalar;
       
       /** the EulerSystem to use, which represents the axes of rotation. */
       typedef _System System;
@@ -146,67 +138,56 @@ namespace Eigen
     public:
       /** Default constructor without initialization. */
       EulerAngles() {}
-      /** Constructs and initialize Euler angles(\p alpha, \p beta, \p gamma). */
+      /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */
       EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
         m_angles(alpha, beta, gamma) {}
       
-      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m.
-        *
-        * \note All angles will be in the range [-PI, PI].
-      */
-      template<typename Derived>
-      EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
+      // TODO: Test this constructor
+      /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */
+      explicit EulerAngles(const Scalar* data) : m_angles(data) {}
       
-      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m,
-        *  with options to choose for each angle the requested range.
-        *
-        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
-        * Otherwise, the specified angle will be in the range [-PI, +PI].
+      /** Constructs and initializes an EulerAngles from either:
+        *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+        *  - a 3D vector expression representing Euler angles.
         *
-        * \param m The 3x3 rotation matrix to convert
-        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-      */
+        * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:<BR>
+        *  Alpha and gamma angles will be in the range [-PI, PI].<BR>
+        *  As for Beta angle:
+        *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+        *   - otherwise:
+        *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
+        *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+       */
       template<typename Derived>
-      EulerAngles(
-        const MatrixBase<Derived>& m,
-        bool positiveRangeAlpha,
-        bool positiveRangeBeta,
-        bool positiveRangeGamma) {
-        
-        System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
-      }
+      explicit EulerAngles(const MatrixBase<Derived>& other) { *this = other; }
       
       /** Constructs and initialize Euler angles from a rotation \p rot.
         *
-        * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles.
-        *  If rot is an EulerAngles, expected EulerAngles range is __undefined__.
-        *  (Use other functions here for enforcing range if this effect is desired)
+        * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly),
+        *  angles ranges are __undefined__.
+        *  Otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+        *  As for Beta angle:
+        *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+        *   - otherwise:
+        *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
+        *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
       */
       template<typename Derived>
-      EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; }
+      EulerAngles(const RotationBase<Derived, 3>& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); }
       
-      /** Constructs and initialize Euler angles from a rotation \p rot,
-        *  with options to choose for each angle the requested range.
-        *
-        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
-        * Otherwise, the specified angle will be in the range [-PI, +PI].
-        *
-        * \param rot The 3x3 rotation matrix to convert
-        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-      */
-      template<typename Derived>
-      EulerAngles(
-        const RotationBase<Derived, 3>& rot,
-        bool positiveRangeAlpha,
-        bool positiveRangeBeta,
-        bool positiveRangeGamma) {
-        
-        System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
-      }
+      /*EulerAngles(const QuaternionType& q)
+      {
+        // TODO: Implement it in a faster way for quaternions
+        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+        // Currently we compute all matrix cells from quaternion.
+
+        // Special case only for ZYX
+        //Scalar y2 = q.y() * q.y();
+        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+      }*/
 
       /** \returns The angle values stored in a vector (alpha, beta, gamma). */
       const Vector3& angles() const { return m_angles; }
@@ -246,90 +227,48 @@ namespace Eigen
         return inverse();
       }
       
-      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m,
-        *  with options to choose for each angle the requested range (__only in compile time__).
+      /** Set \c *this from either:
+        *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+        *  - a 3D vector expression representing Euler angles.
         *
-        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
-        * Otherwise, the specified angle will be in the range [-PI, +PI].
-        *
-        * \param m The 3x3 rotation matrix to convert
-        * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        */
-      template<
-        bool PositiveRangeAlpha,
-        bool PositiveRangeBeta,
-        bool PositiveRangeGamma,
-        typename Derived>
-      static EulerAngles FromRotation(const MatrixBase<Derived>& m)
-      {
-        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
-        
-        EulerAngles e;
-        System::template CalcEulerAngles<
-          PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m);
-        return e;
-      }
-      
-      /** Constructs and initialize Euler angles from a rotation \p rot,
-        *  with options to choose for each angle the requested range (__only in compile time__).
-        *
-        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
-        * Otherwise, the specified angle will be in the range [-PI, +PI].
-        *
-        * \param rot The 3x3 rotation matrix to convert
-        * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * See EulerAngles(const MatrixBase<Derived, 3>&) for more information about
+        *  angles ranges output.
       */
-      template<
-        bool PositiveRangeAlpha,
-        bool PositiveRangeBeta,
-        bool PositiveRangeGamma,
-        typename Derived>
-      static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot)
-      {
-        return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix());
-      }
-      
-      /*EulerAngles& fromQuaternion(const QuaternionType& q)
+      template<class Derived>
+      EulerAngles& operator=(const MatrixBase<Derived>& other)
       {
-        // TODO: Implement it in a faster way for quaternions
-        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
-        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
-        // Currently we compute all matrix cells from quaternion.
-
-        // Special case only for ZYX
-        //Scalar y2 = q.y() * q.y();
-        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
-        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
-        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
-      }*/
-      
-      /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinant of +1). */
-      template<typename Derived>
-      EulerAngles& operator=(const MatrixBase<Derived>& m) {
-        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename Derived::Scalar>::value),
+         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
         
-        System::CalcEulerAngles(*this, m);
+        internal::eulerangles_assign_impl<System, Derived>::run(*this, other.derived());
         return *this;
       }
 
       // TODO: Assign and construct from another EulerAngles (with different system)
       
-      /** Set \c *this from a rotation. */
+      /** Set \c *this from a rotation.
+        *
+        * See EulerAngles(const RotationBase<Derived, 3>&) for more information about
+        *  angles ranges output.
+      */
       template<typename Derived>
       EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
         System::CalcEulerAngles(*this, rot.toRotationMatrix());
         return *this;
       }
       
-      // TODO: Support isApprox function
+      /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+        * determined by \a prec.
+        *
+        * \sa MatrixBase::isApprox() */
+      bool isApprox(const EulerAngles& other,
+        const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+      { return angles().isApprox(other.angles(), prec); }
 
       /** \returns an equivalent 3x3 rotation matrix. */
       Matrix3 toRotationMatrix() const
       {
+        // TODO: Calc it faster
         return static_cast<QuaternionType>(*this).toRotationMatrix();
       }
 
@@ -347,6 +286,15 @@ namespace Eigen
         s << eulerAngles.angles().transpose();
         return s;
       }
+      
+      /** \returns \c *this with scalar type casted to \a NewScalarType */
+      template <typename NewScalarType>
+      EulerAngles<NewScalarType, System> cast() const
+      {
+        EulerAngles<NewScalarType, System> e;
+        e.angles() = angles().template cast<NewScalarType>();
+        return e;
+      }
   };
 
 #define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
@@ -379,8 +327,29 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
     {
       typedef _Scalar Scalar;
     };
+    
+    // set from a rotation matrix
+    template<class System, class Other>
+    struct eulerangles_assign_impl<System,Other,3,3>
+    {
+      typedef typename Other::Scalar Scalar;
+      static void run(EulerAngles<Scalar, System>& e, const Other& m)
+      {
+        System::CalcEulerAngles(e, m);
+      }
+    };
+    
+    // set from a vector of Euler angles
+    template<class System, class Other>
+    struct eulerangles_assign_impl<System,Other,3,1>
+    {
+      typedef typename Other::Scalar Scalar;
+      static void run(EulerAngles<Scalar, System>& e, const Other& vec)
+      {
+        e.angles() = vec;
+      }
+    };
   }
-  
 }
 
 #endif // EIGEN_EULERANGLESCLASS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/contrib/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
index 98f9f647df170b0086437a367ed81bc1fe7611c4..2a833b0a48db7a64703e5bc8a25fff36d32825c0 100644
--- a/contrib/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
+++ b/contrib/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -12,13 +12,13 @@
 
 namespace Eigen
 {
-  // Forward declerations
+  // Forward declarations
   template <typename _Scalar, class _System>
   class EulerAngles;
   
   namespace internal
   {
-    // TODO: Check if already exists on the rest API
+    // TODO: Add this trait to the Eigen internal API?
     template <int Num, bool IsPositive = (Num > 0)>
     struct Abs
     {
@@ -36,6 +36,12 @@ namespace Eigen
     {
       enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
     };
+    
+    template<typename System,
+            typename Other,
+            int OtherRows=Other::RowsAtCompileTime,
+            int OtherCols=Other::ColsAtCompileTime>
+    struct eulerangles_assign_impl;
   }
   
   #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
@@ -69,7 +75,7 @@ namespace Eigen
     *
     * You can use this class to get two things:
     *  - Build an Euler system, and then pass it as a template parameter to EulerAngles.
-    *  - Query some compile time data about an Euler system. (e.g. Whether it's tait bryan)
+    *  - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan)
     *
     * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles)
     * This meta-class store constantly those signed axes. (see \ref EulerAxis)
@@ -80,7 +86,7 @@ namespace Eigen
     *  signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
     *  - all axes X, Y, Z in each valid order (see below what order is valid)
     *  - rotation over the axis is supported both over the positive and negative directions.
-    *  - both tait bryan and proper/classic Euler angles (i.e. the opposite).
+    *  - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite).
     *
     * Since EulerSystem support both positive and negative directions,
     *  you may call this rotation distinction in other names:
@@ -90,7 +96,7 @@ namespace Eigen
     * Notice all axed combination are valid, and would trigger a static assertion.
     * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
     * This yield two and only two classes:
-    *  - _tait bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+    *  - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
     *  - _proper/classic Euler angles_ - The first and the third unsigned axes is equal,
     *     and the second is different, e.g. {X,Y,X}
     *
@@ -112,9 +118,9 @@ namespace Eigen
     *
     * \tparam _AlphaAxis the first fixed EulerAxis
     *
-    * \tparam _AlphaAxis the second fixed EulerAxis
+    * \tparam _BetaAxis the second fixed EulerAxis
     *
-    * \tparam _AlphaAxis the third fixed EulerAxis
+    * \tparam _GammaAxis the third fixed EulerAxis
     */
   template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
   class EulerSystem
@@ -138,14 +144,16 @@ namespace Eigen
       BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
       GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
       
-      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< weather alpha axis is negative */
-      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< weather beta axis is negative */
-      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< weather gamma axis is negative */
-      
-      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< weather the Euler system is odd */
-      IsEven = IsOdd ? 0 : 1, /*!< weather the Euler system is even */
+      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
+      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
+      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+
+      // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed
+      // by Z, or Z is followed by X; otherwise it is odd.
+      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
+      IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
 
-      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< weather the Euler system is tait bryan */
+      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
     };
     
     private:
@@ -165,142 +173,113 @@ namespace Eigen
     EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
       BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
 
-    enum
-    {
+    static const int
       // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. 
       // They are used in this class converters.
       // They are always different from each other, and their possible values are: 0, 1, or 2.
-      I = AlphaAxisAbs - 1,
-      J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
-      K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
-    };
+      I_ = AlphaAxisAbs - 1,
+      J_ = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+      K_ = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+    ;
     
     // TODO: Get @mat parameter in form that avoids double evaluation.
     template <typename Derived>
     static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
     {
       using std::atan2;
-      using std::sin;
-      using std::cos;
+      using std::sqrt;
       
       typedef typename Derived::Scalar Scalar;
-      typedef Matrix<Scalar,2,1> Vector2;
-      
-      res[0] = atan2(mat(J,K), mat(K,K));
-      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
-      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
-        if(res[0] > Scalar(0)) {
-          res[0] -= Scalar(EIGEN_PI);
-        }
-        else {
-          res[0] += Scalar(EIGEN_PI);
-        }
-        res[1] = atan2(-mat(I,K), -c2);
+
+      const Scalar plusMinus = IsEven? 1 : -1;
+      const Scalar minusPlus = IsOdd?  1 : -1;
+
+      const Scalar Rsum = sqrt((mat(I_,I_) * mat(I_,I_) + mat(I_,J_) * mat(I_,J_) + mat(J_,K_) * mat(J_,K_) + mat(K_,K_) * mat(K_,K_))/2);
+      res[1] = atan2(plusMinus * mat(I_,K_), Rsum);
+
+      // There is a singularity when cos(beta) == 0
+      if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// cos(beta) != 0
+        res[0] = atan2(minusPlus * mat(J_, K_), mat(K_, K_));
+        res[2] = atan2(minusPlus * mat(I_, J_), mat(I_, I_));
+      }
+      else if(plusMinus * mat(I_, K_) > 0) {// cos(beta) == 0 and sin(beta) == 1
+        Scalar spos = mat(J_, I_) + plusMinus * mat(K_, J_); // 2*sin(alpha + plusMinus * gamma
+        Scalar cpos = mat(J_, J_) + minusPlus * mat(K_, I_); // 2*cos(alpha + plusMinus * gamma)
+        Scalar alphaPlusMinusGamma = atan2(spos, cpos);
+        res[0] = alphaPlusMinusGamma;
+        res[2] = 0;
+      }
+      else {// cos(beta) == 0 and sin(beta) == -1
+        Scalar sneg = plusMinus * (mat(K_, J_) + minusPlus * mat(J_, I_)); // 2*sin(alpha + minusPlus*gamma)
+        Scalar cneg = mat(J_, J_) + plusMinus * mat(K_, I_);               // 2*cos(alpha + minusPlus*gamma)
+        Scalar alphaMinusPlusBeta = atan2(sneg, cneg);
+        res[0] = alphaMinusPlusBeta;
+        res[2] = 0;
       }
-      else
-        res[1] = atan2(-mat(I,K), c2);
-      Scalar s1 = sin(res[0]);
-      Scalar c1 = cos(res[0]);
-      res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
     }
 
     template <typename Derived>
-    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res,
+                                    const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
     {
       using std::atan2;
-      using std::sin;
-      using std::cos;
+      using std::sqrt;
 
       typedef typename Derived::Scalar Scalar;
-      typedef Matrix<Scalar,2,1> Vector2;
-      
-      res[0] = atan2(mat(J,I), mat(K,I));
-      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
-      {
-        if(res[0] > Scalar(0)) {
-          res[0] -= Scalar(EIGEN_PI);
-        }
-        else {
-          res[0] += Scalar(EIGEN_PI);
-        }
-        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
-        res[1] = -atan2(s2, mat(I,I));
-      }
-      else
-      {
-        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
-        res[1] = atan2(s2, mat(I,I));
-      }
 
-      // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
-      // we can compute their respective rotation, and apply its inverse to M. Since the result must
-      // be a rotation around x, we have:
-      //
-      //  c2  s1.s2 c1.s2                   1  0   0 
-      //  0   c1    -s1       *    M    =   0  c3  s3
-      //  -s2 s1.c2 c1.c2                   0 -s3  c3
-      //
-      //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+      const Scalar plusMinus = IsEven? 1 : -1;
+      const Scalar minusPlus = IsOdd?  1 : -1;
+
+      const Scalar Rsum = sqrt((mat(I_, J_) * mat(I_, J_) + mat(I_, K_) * mat(I_, K_) + mat(J_, I_) * mat(J_, I_) + mat(K_, I_) * mat(K_, I_)) / 2);
 
-      Scalar s1 = sin(res[0]);
-      Scalar c1 = cos(res[0]);
-      res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
+      res[1] = atan2(Rsum, mat(I_, I_));
+
+      // There is a singularity when sin(beta) == 0
+      if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// sin(beta) != 0
+        res[0] = atan2(mat(J_, I_), minusPlus * mat(K_, I_));
+        res[2] = atan2(mat(I_, J_), plusMinus * mat(I_, K_));
+      }
+      else if(mat(I_, I_) > 0) {// sin(beta) == 0 and cos(beta) == 1
+        Scalar spos = plusMinus * mat(K_, J_) + minusPlus * mat(J_, K_); // 2*sin(alpha + gamma)
+        Scalar cpos = mat(J_, J_) + mat(K_, K_);                         // 2*cos(alpha + gamma)
+        res[0] = atan2(spos, cpos);
+        res[2] = 0;
+      }
+      else {// sin(beta) == 0 and cos(beta) == -1
+        Scalar sneg = plusMinus * mat(K_, J_) + plusMinus * mat(J_, K_); // 2*sin(alpha - gamma)
+        Scalar cneg = mat(J_, J_) - mat(K_, K_);                         // 2*cos(alpha - gamma)
+        res[0] = atan2(sneg, cneg);
+        res[2] = 0;
+      }
     }
     
     template<typename Scalar>
     static void CalcEulerAngles(
       EulerAngles<Scalar, EulerSystem>& res,
       const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
-    {
-      CalcEulerAngles(res, mat, false, false, false);
-    }
-    
-    template<
-      bool PositiveRangeAlpha,
-      bool PositiveRangeBeta,
-      bool PositiveRangeGamma,
-      typename Scalar>
-    static void CalcEulerAngles(
-      EulerAngles<Scalar, EulerSystem>& res,
-      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
-    {
-      CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma);
-    }
-    
-    template<typename Scalar>
-    static void CalcEulerAngles(
-      EulerAngles<Scalar, EulerSystem>& res,
-      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat,
-      bool PositiveRangeAlpha,
-      bool PositiveRangeBeta,
-      bool PositiveRangeGamma)
     {
       CalcEulerAngles_imp(
         res.angles(), mat,
         typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
 
-      if (IsAlphaOpposite == IsOdd)
+      if (IsAlphaOpposite)
         res.alpha() = -res.alpha();
         
-      if (IsBetaOpposite == IsOdd)
+      if (IsBetaOpposite)
         res.beta() = -res.beta();
         
-      if (IsGammaOpposite == IsOdd)
+      if (IsGammaOpposite)
         res.gamma() = -res.gamma();
-      
-      // Saturate results to the requested range
-      if (PositiveRangeAlpha && (res.alpha() < 0))
-        res.alpha() += Scalar(2 * EIGEN_PI);
-      
-      if (PositiveRangeBeta && (res.beta() < 0))
-        res.beta() += Scalar(2 * EIGEN_PI);
-      
-      if (PositiveRangeGamma && (res.gamma() < 0))
-        res.gamma() += Scalar(2 * EIGEN_PI);
     }
     
     template <typename _Scalar, class _System>
     friend class Eigen::EulerAngles;
+    
+    template<typename System,
+            typename Other,
+            int OtherRows,
+            int OtherCols>
+    friend struct internal::eulerangles_assign_impl;
   };
 
 #define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
diff --git a/contrib/eigen/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/contrib/eigen/unsupported/Eigen/src/FFT/ei_fftw_impl.h
index 7c1f716e2a866937fda26c4ae6e96e8eec2d82eb..1c2cd24a0b458b8431f5dcb2da84c9620f9d77de 100644
--- a/contrib/eigen/unsupported/Eigen/src/FFT/ei_fftw_impl.h
+++ b/contrib/eigen/unsupported/Eigen/src/FFT/ei_fftw_impl.h
@@ -259,5 +259,3 @@ namespace internal {
 } // end namespace internal
 
 } // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/contrib/eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index be51b4e6feb1aae4c05930dae3983d31b7e12d34..430953aeec7ead7bb5a306325da81df1d2ea166a 100644
--- a/contrib/eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/contrib/eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -25,16 +25,47 @@ struct kiss_cpx_fft
   std::vector<Complex> m_scratchBuf;
   bool m_inverse;
 
-  inline
-    void make_twiddles(int nfft,bool inverse)
+  inline void make_twiddles(int nfft, bool inverse)
+  {
+    using numext::sin;
+    using numext::cos;
+    m_inverse = inverse;
+    m_twiddles.resize(nfft);
+    double phinc =  0.25 * double(EIGEN_PI) / nfft;
+    Scalar flip = inverse ? Scalar(1) : Scalar(-1);
+    m_twiddles[0] = Complex(Scalar(1), Scalar(0));
+    if ((nfft&1)==0)
+      m_twiddles[nfft/2] = Complex(Scalar(-1), Scalar(0));
+    int i=1;
+    for (;i*8<nfft;++i)
     {
-      using std::acos;
-      m_inverse = inverse;
-      m_twiddles.resize(nfft);
-      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-      for (int i=0;i<nfft;++i)
-        m_twiddles[i] = exp( Complex(0,i*phinc) );
+      Scalar c = Scalar(cos(i*8*phinc));
+      Scalar s = Scalar(sin(i*8*phinc));
+      m_twiddles[i] = Complex(c, s*flip);
+      m_twiddles[nfft-i] = Complex(c, -s*flip);
     }
+    for (;i*4<nfft;++i)
+    {
+      Scalar c = Scalar(cos((2*nfft-8*i)*phinc));
+      Scalar s = Scalar(sin((2*nfft-8*i)*phinc));
+      m_twiddles[i] = Complex(s, c*flip);
+      m_twiddles[nfft-i] = Complex(s, -c*flip);
+    }
+    for (;i*8<3*nfft;++i)
+    {
+      Scalar c = Scalar(cos((8*i-2*nfft)*phinc));
+      Scalar s = Scalar(sin((8*i-2*nfft)*phinc));
+      m_twiddles[i] = Complex(-s, c*flip);
+      m_twiddles[nfft-i] = Complex(-s, -c*flip);
+    }
+    for (;i*2<nfft;++i)
+    {
+      Scalar c = Scalar(cos((4*nfft-8*i)*phinc));
+      Scalar s = Scalar(sin((4*nfft-8*i)*phinc));
+      m_twiddles[i] = Complex(-c, s*flip);
+      m_twiddles[nfft-i] = Complex(-c, -s*flip);
+    }
+  }
 
   void factorize(int nfft)
   {
@@ -316,8 +347,8 @@ struct kissfft_impl
 
         // use optimized mode for even real
         fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
-        Complex dc = dst[0].real() +  dst[0].imag();
-        Complex nyquist = dst[0].real() -  dst[0].imag();
+        Complex dc(dst[0].real() +  dst[0].imag());
+        Complex nyquist(dst[0].real() -  dst[0].imag());
         int k;
         for ( k=1;k <= ncfft2 ; ++k ) {
           Complex fpk = dst[k];
@@ -416,5 +447,3 @@ struct kissfft_impl
 } // end namespace internal
 
 } // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
index dc0093eb9792c818008af8cb161ebfc25ba0ba76..e7d70f39d3824abce9efb97b671e295492444276 100644
--- a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
+++ b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
@@ -31,13 +31,13 @@
 #ifndef EIGEN_CONSTRAINEDCG_H
 #define EIGEN_CONSTRAINEDCG_H
 
-#include <Eigen/Core>
+#include "../../../../Eigen/Core"
 
 namespace Eigen { 
 
 namespace internal {
 
-/** \ingroup IterativeSolvers_Module
+/** \ingroup IterativeLinearSolvers_Module
   * Compute the pseudo inverse of the non-square matrix C such that
   * \f$ CINV = (C * C^T)^{-1} * C \f$ based on a conjugate gradient method.
   *
@@ -96,10 +96,10 @@ void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV)
 
 
 
-/** \ingroup IterativeSolvers_Module
+/** \ingroup IterativeLinearSolvers_Module
   * Constrained conjugate gradient
   *
-  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the contraint \f$ Cx \le f \f$
+  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the constraint \f$ Cx \le f \f$
   */
 template<typename TMatrix, typename CMatrix,
          typename VectorX, typename VectorB, typename VectorF>
@@ -158,8 +158,6 @@ void constrained_cg(const TMatrix& A, const CMatrix& C, VectorX& x,
     rho = r.dot(z);
 
     if (iter.finished(rho)) break;
-
-    if (iter.noiseLevel() > 0 && transition) std::cerr << "CCG: transition\n";
     if (transition || iter.first()) gamma = 0.0;
     else gamma = (std::max)(0.0, (rho - old_z.dot(z)) / rho_1);
     p = z + gamma*p;
diff --git a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
index 4079e2367d63f16d52b6568244ae8dd6502547cc..5ae011b7592a82922225bdfe199e03b8b9009a19 100644
--- a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+++ b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_DGMRES_H
 #define EIGEN_DGMRES_H
 
-#include <Eigen/Eigenvalues>
+#include "../../../../Eigen/Eigenvalues"
 
 namespace Eigen { 
   
@@ -57,7 +57,7 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
 
 }
 /**
- * \ingroup IterativeLInearSolvers_Module
+ * \ingroup IterativeLinearSolvers_Module
  * \brief A Restarted GMRES with deflation.
  * This class implements a modification of the GMRES solver for
  * sparse linear systems. The basis is built with modified 
@@ -88,7 +88,7 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
  * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
  *  Algebraic Solvers for Linear Systems Arising from Compressible
  *  Flows, Computers and Fluids, In Press,
- *  http://dx.doi.org/10.1016/j.compfluid.2012.03.023   
+ *  https://doi.org/10.1016/j.compfluid.2012.03.023   
  * [2] K. Burrage and J. Erhel, On the performance of various 
  * adaptive preconditioned GMRES strategies, 5(1998), 101-121.
  * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES 
@@ -109,6 +109,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     using Base::m_tolerance; 
   public:
     using Base::_solve_impl;
+    using Base::_solve_with_guess_impl;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::StorageIndex StorageIndex;
@@ -141,30 +142,16 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
   
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
-  {    
-    bool failed = false;
-    for(Index j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-      
-      typename Dest::ColXpr xj(x,j);
-      dgmres(matrix(), b.col(j), xj, Base::m_preconditioner);
-    }
-    m_info = failed ? NumericalIssue
-           : m_error <= Base::m_tolerance ? Success
-           : NoConvergence;
-    m_isInitialized = true;
-  }
-
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve_impl(const Rhs& b, MatrixBase<Dest>& x) const
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
   {
-    x = b;
-    _solve_with_guess_impl(b,x.derived());
+    EIGEN_STATIC_ASSERT(Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+    
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    
+    dgmres(matrix(), b, x, Base::m_preconditioner);
   }
+
   /** 
    * Get the restart value
     */
@@ -212,7 +199,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     void dgmresInitDeflation(Index& rows) const; 
     mutable DenseMatrix m_V; // Krylov basis vectors
     mutable DenseMatrix m_H; // Hessenberg matrix 
-    mutable DenseMatrix m_Hes; // Initial hessenberg matrix wihout Givens rotations applied
+    mutable DenseMatrix m_Hes; // Initial hessenberg matrix without Givens rotations applied
     mutable Index m_restart; // Maximum size of the Krylov subspace
     mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace 
     mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
@@ -241,18 +228,30 @@ template<typename Rhs, typename Dest>
 void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x,
               const Preconditioner& precond) const
 {
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+  RealScalar normRhs = rhs.norm();
+  if(normRhs <= considerAsZero) 
+  {
+    x.setZero();
+    m_error = 0;
+    return;
+  }
+
   //Initialization
+  m_isDeflInitialized = false;
   Index n = mat.rows(); 
   DenseVector r0(n); 
   Index nbIts = 0; 
   m_H.resize(m_restart+1, m_restart);
   m_Hes.resize(m_restart, m_restart);
   m_V.resize(n,m_restart+1);
-  //Initial residual vector and intial norm
-  x = precond.solve(x);
+  //Initial residual vector and initial norm
+  if(x.squaredNorm()==0) 
+    x = precond.solve(rhs);
   r0 = rhs - mat * x; 
   RealScalar beta = r0.norm(); 
-  RealScalar normRhs = rhs.norm();
+  
   m_error = beta/normRhs; 
   if(m_error < m_tolerance)
     m_info = Success; 
@@ -265,8 +264,10 @@ void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rh
     dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts); 
     
     // Compute the new residual vector for the restart 
-    if (nbIts < m_iterations && m_info == NoConvergence)
-      r0 = rhs - mat * x; 
+    if (nbIts < m_iterations && m_info == NoConvergence) {
+      r0 = rhs - mat * x;
+      beta = r0.norm();
+    }
   }
 } 
 
diff --git a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h
index 92618b107014b1d0b1ba47f49c2e86021140e567..ff912094f1ae440ea0323dd18de4eaeb3142c712 100644
--- a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h
+++ b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -64,6 +64,15 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition
   typedef Matrix < Scalar, Dynamic, 1 > VectorType;
   typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
 
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+  if(rhs.norm() <= considerAsZero) 
+  {
+    x.setZero();
+    tol_error = 0;
+    return true;
+  }
+
   RealScalar tol = tol_error;
   const Index maxIters = iters;
   iters = 0;
@@ -307,31 +316,14 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
   {
-    bool failed = false;
-    for(Index j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-
-      typename Dest::ColXpr xj(x,j);
-      if(!internal::gmres(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error))
-        failed = true;
-    }
-    m_info = failed ? NumericalIssue
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    bool ret = internal::gmres(matrix(), b, x, Base::m_preconditioner, m_iterations, m_restart, m_error);
+    m_info = (!ret) ? NumericalIssue
           : m_error <= Base::m_tolerance ? Success
           : NoConvergence;
-    m_isInitialized = true;
-  }
-
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve_impl(const Rhs& b, MatrixBase<Dest> &x) const
-  {
-    x = b;
-    if(x.squaredNorm() == 0) return; // Check Zero right hand side
-    _solve_with_guess_impl(b,x.derived());
   }
 
 protected:
diff --git a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/IDRS.h b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/IDRS.h
new file mode 100755
index 0000000000000000000000000000000000000000..90d20fad4858c2497bde0625b55686bda6a83c36
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/IDRS.h
@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Chris Schoutrop <c.e.m.schoutrop@tue.nl>
+// Copyright (C) 2020 Jens Wehner <j.wehner@esciencecenter.nl>
+// Copyright (C) 2020 Jan van Dijk <j.v.dijk@tue.nl>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_IDRS_H
+#define EIGEN_IDRS_H
+
+namespace Eigen
+{
+
+	namespace internal
+	{
+		/**     \internal Low-level Induced Dimension Reduction algoritm
+		        \param A The matrix A
+		        \param b The right hand side vector b
+		        \param x On input and initial solution, on output the computed solution.
+		        \param precond A preconditioner being able to efficiently solve for an
+		                  approximation of Ax=b (regardless of b)
+		        \param iter On input the max number of iteration, on output the number of performed iterations.
+		        \param relres On input the tolerance error, on output an estimation of the relative error.
+		        \param S On input Number of the dimension of the shadow space.
+				\param smoothing switches residual smoothing on.
+				\param angle small omega lead to faster convergence at the expense of numerical stability
+				\param replacement switches on a residual replacement strategy to increase accuracy of residual at the expense of more Mat*vec products
+		        \return false in the case of numerical issue, for example a break down of IDRS.
+		*/
+		template<typename Vector, typename RealScalar>
+		typename Vector::Scalar omega(const Vector& t, const Vector& s, RealScalar angle)
+		{
+			using numext::abs;
+			typedef typename Vector::Scalar Scalar;
+			const RealScalar ns = s.norm();
+			const RealScalar nt = t.norm();
+			const Scalar ts = t.dot(s);
+			const RealScalar rho = abs(ts / (nt * ns));
+
+			if (rho < angle) {
+				if (ts == Scalar(0)) {
+					return Scalar(0);
+				}
+				// Original relation for om is given by
+				// om = om * angle / rho;
+				// To alleviate potential (near) division by zero this can be rewritten as
+				// om = angle * (ns / nt) * (ts / abs(ts)) = angle * (ns / nt) * sgn(ts)
+  				return angle * (ns / nt) * (ts / abs(ts));
+			}
+			return ts / (nt * nt);
+		}
+
+		template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+		bool idrs(const MatrixType& A, const Rhs& b, Dest& x, const Preconditioner& precond,
+			Index& iter,
+			typename Dest::RealScalar& relres, Index S, bool smoothing, typename Dest::RealScalar angle, bool replacement)
+		{
+			typedef typename Dest::RealScalar RealScalar;
+			typedef typename Dest::Scalar Scalar;
+			typedef Matrix<Scalar, Dynamic, 1> VectorType;
+			typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> DenseMatrixType;
+			const Index N = b.size();
+			S = S < x.rows() ? S : x.rows();
+			const RealScalar tol = relres;
+			const Index maxit = iter;
+
+			Index replacements = 0;
+			bool trueres = false;
+
+			FullPivLU<DenseMatrixType> lu_solver;
+
+			DenseMatrixType P;
+			{
+				HouseholderQR<DenseMatrixType> qr(DenseMatrixType::Random(N, S));
+			    P = (qr.householderQ() * DenseMatrixType::Identity(N, S));
+			}
+
+			const RealScalar normb = b.norm();
+
+			if (internal::isApprox(normb, RealScalar(0)))
+			{
+				//Solution is the zero vector
+				x.setZero();
+				iter = 0;
+				relres = 0;
+				return true;
+			}
+			 // from http://homepage.tudelft.nl/1w5b5/IDRS/manual.pdf
+			 // A peak in the residual is considered dangerously high if‖ri‖/‖b‖> C(tol/epsilon).
+			 // With epsilon the
+             // relative machine precision. The factor tol/epsilon corresponds to the size of a
+             // finite precision number that is so large that the absolute round-off error in
+             // this number, when propagated through the process, makes it impossible to
+             // achieve the required accuracy.The factor C accounts for the accumulation of
+             // round-off errors. This parameter has beenset to 10−3.
+			 // mp is epsilon/C
+			 // 10^3 * eps is very conservative, so normally no residual replacements will take place. 
+			 // It only happens if things go very wrong. Too many restarts may ruin the convergence.
+			const RealScalar mp = RealScalar(1e3) * NumTraits<Scalar>::epsilon();
+
+
+
+			//Compute initial residual
+			const RealScalar tolb = tol * normb; //Relative tolerance
+			VectorType r = b - A * x;
+
+			VectorType x_s, r_s;
+
+			if (smoothing)
+			{
+				x_s = x;
+				r_s = r;
+			}
+
+			RealScalar normr = r.norm();
+
+			if (normr <= tolb)
+			{
+				//Initial guess is a good enough solution
+				iter = 0;
+				relres = normr / normb;
+				return true;
+			}
+
+			DenseMatrixType G = DenseMatrixType::Zero(N, S);
+			DenseMatrixType U = DenseMatrixType::Zero(N, S);
+			DenseMatrixType M = DenseMatrixType::Identity(S, S);
+			VectorType t(N), v(N);
+			Scalar om = 1.;
+
+			//Main iteration loop, guild G-spaces:
+			iter = 0;
+
+			while (normr > tolb && iter < maxit)
+			{
+				//New right hand size for small system:
+				VectorType f = (r.adjoint() * P).adjoint();
+
+				for (Index k = 0; k < S; ++k)
+				{
+					//Solve small system and make v orthogonal to P:
+					//c = M(k:s,k:s)\f(k:s);
+					lu_solver.compute(M.block(k , k , S -k, S - k ));
+					VectorType c = lu_solver.solve(f.segment(k , S - k ));
+					//v = r - G(:,k:s)*c;
+					v = r - G.rightCols(S - k ) * c;
+					//Preconditioning
+					v = precond.solve(v);
+
+					//Compute new U(:,k) and G(:,k), G(:,k) is in space G_j
+					U.col(k) = U.rightCols(S - k ) * c + om * v;
+					G.col(k) = A * U.col(k );
+
+					//Bi-Orthogonalise the new basis vectors:
+					for (Index i = 0; i < k-1 ; ++i)
+					{
+						//alpha =  ( P(:,i)'*G(:,k) )/M(i,i);
+						Scalar alpha = P.col(i ).dot(G.col(k )) / M(i, i );
+						G.col(k ) = G.col(k ) - alpha * G.col(i );
+						U.col(k ) = U.col(k ) - alpha * U.col(i );
+					}
+
+					//New column of M = P'*G  (first k-1 entries are zero)
+					//M(k:s,k) = (G(:,k)'*P(:,k:s))';
+					M.block(k , k , S - k , 1) = (G.col(k ).adjoint() * P.rightCols(S - k )).adjoint();
+
+					if (internal::isApprox(M(k,k), Scalar(0)))
+					{
+						return false;
+					}
+
+					//Make r orthogonal to q_i, i = 0..k-1
+					Scalar beta = f(k ) / M(k , k );
+					r = r - beta * G.col(k );
+					x = x + beta * U.col(k );
+					normr = r.norm();
+
+					if (replacement && normr > tolb / mp)
+					{
+						trueres = true;
+					}
+
+					//Smoothing:
+					if (smoothing)
+					{
+						t = r_s - r;
+						//gamma is a Scalar, but the conversion is not allowed
+						Scalar gamma = t.dot(r_s) / t.norm();
+						r_s = r_s - gamma * t;
+						x_s = x_s - gamma * (x_s - x);
+						normr = r_s.norm();
+					}
+
+					if (normr < tolb || iter == maxit)
+					{
+						break;
+					}
+
+					//New f = P'*r (first k  components are zero)
+					if (k < S-1)
+					{
+						f.segment(k + 1, S - (k + 1) ) = f.segment(k + 1 , S - (k + 1)) - beta * M.block(k + 1 , k , S - (k + 1), 1);
+					}
+				}//end for
+
+				if (normr < tolb || iter == maxit)
+				{
+					break;
+				}
+
+				//Now we have sufficient vectors in G_j to compute residual in G_j+1
+				//Note: r is already perpendicular to P so v = r
+				//Preconditioning
+				v = r;
+				v = precond.solve(v);
+
+				//Matrix-vector multiplication:
+				t = A * v;
+
+				//Computation of a new omega
+				om = internal::omega(t, r, angle);
+
+				if (om == RealScalar(0.0))
+				{
+					return false;
+				}
+
+				r = r - om * t;
+				x = x + om * v;
+				normr = r.norm();
+
+				if (replacement && normr > tolb / mp)
+				{
+					trueres = true;
+				}
+
+				//Residual replacement?
+				if (trueres && normr < normb)
+				{
+					r = b - A * x;
+					trueres = false;
+					replacements++;
+				}
+
+				//Smoothing:
+				if (smoothing)
+				{
+					t = r_s - r;
+					Scalar gamma = t.dot(r_s) /t.norm();
+					r_s = r_s - gamma * t;
+					x_s = x_s - gamma * (x_s - x);
+					normr = r_s.norm();
+				}
+
+				iter++;
+
+			}//end while
+
+			if (smoothing)
+			{
+				x = x_s;
+			}
+			relres=normr/normb;
+			return true;
+		}
+
+	}  // namespace internal
+
+	template <typename _MatrixType, typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+	class IDRS;
+
+	namespace internal
+	{
+
+		template <typename _MatrixType, typename _Preconditioner>
+		struct traits<Eigen::IDRS<_MatrixType, _Preconditioner> >
+		{
+			typedef _MatrixType MatrixType;
+			typedef _Preconditioner Preconditioner;
+		};
+
+	}  // namespace internal
+
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief The Induced Dimension Reduction method (IDR(s)) is a short-recurrences Krylov method for sparse square problems.
+  *
+  * This class allows to solve for A.x = b sparse linear problems. The vectors x and b can be either dense or sparse.
+  * he Induced Dimension Reduction method, IDR(), is a robust and efficient short-recurrence Krylov subspace method for
+  * solving large nonsymmetric systems of linear equations.
+  *
+  * For indefinite systems IDR(S) outperforms both BiCGStab and BiCGStab(L). Additionally, IDR(S) can handle matrices
+  * with complex eigenvalues more efficiently than BiCGStab.
+  *
+  * Many problems that do not converge for BiCGSTAB converge for IDR(s) (for larger values of s). And if both methods 
+  * converge the convergence for IDR(s) is typically much faster for difficult systems (for example indefinite problems). 
+  *
+  * IDR(s) is a limited memory finite termination method. In exact arithmetic it converges in at most N+N/s iterations,
+  * with N the system size.  It uses a fixed number of 4+3s vector. In comparison, BiCGSTAB terminates in 2N iterations 
+  * and uses 7 vectors. GMRES terminates in at most N iterations, and uses I+3 vectors, with I the number of iterations. 
+  * Restarting GMRES limits the memory consumption, but destroys the finite termination property.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+  *
+  * \implsparsesolverconcept
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  *
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  *
+  * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+  * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  *
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  *
+  * IDR(s) can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
+  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  */
+	template <typename _MatrixType, typename _Preconditioner>
+	class IDRS : public IterativeSolverBase<IDRS<_MatrixType, _Preconditioner> >
+	{
+
+		public:
+			typedef _MatrixType MatrixType;
+			typedef typename MatrixType::Scalar Scalar;
+			typedef typename MatrixType::RealScalar RealScalar;
+			typedef _Preconditioner Preconditioner;
+
+		private:
+			typedef IterativeSolverBase<IDRS> Base;
+			using Base::m_error;
+			using Base::m_info;
+			using Base::m_isInitialized;
+			using Base::m_iterations;
+			using Base::matrix;
+			Index m_S;
+			bool m_smoothing;
+			RealScalar m_angle;
+			bool m_residual;
+
+		public:
+			/** Default constructor. */
+			IDRS(): m_S(4), m_smoothing(false), m_angle(RealScalar(0.7)), m_residual(false) {}
+
+			/**     Initialize the solver with matrix \a A for further \c Ax=b solving.
+
+			        This constructor is a shortcut for the default constructor followed
+			        by a call to compute().
+
+			        \warning this class stores a reference to the matrix A as well as some
+			        precomputed values that depend on it. Therefore, if \a A is changed
+			        this class becomes invalid. Call compute() to update it with the new
+			        matrix A, or modify a copy of A.
+			*/
+			template <typename MatrixDerived>
+			explicit IDRS(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_S(4), m_smoothing(false),
+															   m_angle(RealScalar(0.7)), m_residual(false) {}
+
+
+			/** \internal */
+			/**     Loops over the number of columns of b and does the following:
+			                1. sets the tolerence and maxIterations
+			                2. Calls the function that has the core solver routine
+			*/
+			template <typename Rhs, typename Dest>
+			void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+			{
+				m_iterations = Base::maxIterations();
+				m_error = Base::m_tolerance;
+
+				bool ret = internal::idrs(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error, m_S,m_smoothing,m_angle,m_residual);
+
+				m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence;
+			}
+
+			/** Sets the parameter S, indicating the dimension of the shadow space. Default is 4*/
+			void setS(Index S)
+			{
+				if (S < 1)
+				{
+					S = 4;
+				}
+
+				m_S = S;
+			}
+
+			/** Switches off and on smoothing.
+			Residual smoothing results in monotonically decreasing residual norms at
+			the expense of two extra vectors of storage and a few extra vector
+			operations. Although monotonic decrease of the residual norms is a
+			desirable property, the rate of convergence of the unsmoothed process and
+			the smoothed process is basically the same. Default is off */
+			void setSmoothing(bool smoothing)
+			{
+				m_smoothing=smoothing;
+			}
+
+			/** The angle must be a real scalar. In IDR(s), a value for the
+			iteration parameter omega must be chosen in every s+1th step. The most
+			natural choice is to select a value to minimize the norm of the next residual.
+			This corresponds to the parameter omega = 0. In practice, this may lead to
+			values of omega that are so small that the other iteration parameters
+			cannot be computed with sufficient accuracy. In such cases it is better to
+			increase the value of omega sufficiently such that a compromise is reached
+			between accurate computations and reduction of the residual norm. The
+			parameter angle =0.7 (”maintaining the convergence strategy”)
+			results in such a compromise. */
+			void setAngle(RealScalar angle)
+			{
+				m_angle=angle;
+			}
+
+			/** The parameter replace is a logical that determines whether a
+			residual replacement strategy is employed to increase the accuracy of the
+			solution. */
+			void setResidualUpdate(bool update)
+			{
+				m_residual=update;
+			}
+
+	};
+
+}  // namespace Eigen
+
+#endif /* EIGEN_IDRS_H */
diff --git a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/IterationController.h b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/IterationController.h
index c9c1a4be29cd40fe335cace1c7fcce4b34d29230..a116e09e2253bf36ffbb32196f93edf483e19ff4 100644
--- a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/IterationController.h
+++ b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/IterationController.h
@@ -60,7 +60,7 @@
 
 namespace Eigen { 
 
-/** \ingroup IterativeSolvers_Module
+/** \ingroup IterativeLinearSolvers_Module
   * \class IterationController
   *
   * \brief Controls the iterations of the iterative solvers
diff --git a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h
index 256990c1af245299b35777477308afd31a8efafe..5db454d243a850ea25b52814a852c44db4d8cce7 100644
--- a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h
+++ b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h
@@ -3,6 +3,7 @@
 //
 // Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
 // Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018 David Hyde <dabh@stanford.edu>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -64,8 +65,6 @@ namespace Eigen {
             eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
             RealScalar beta_new(sqrt(beta_new2));
             const RealScalar beta_one(beta_new);
-            v_new /= beta_new;
-            w_new /= beta_new;
             // Initialize other variables
             RealScalar c(1.0); // the cosine of the Givens rotation
             RealScalar c_old(1.0);
@@ -83,18 +82,18 @@ namespace Eigen {
                 /* Note that there are 4 variants on the Lanczos algorithm. These are
                  * described in Paige, C. C. (1972). Computational variants of
                  * the Lanczos method for the eigenproblem. IMA Journal of Applied
-                 * Mathematics, 10(3), 373–381. The current implementation corresponds 
+                 * Mathematics, 10(3), 373-381. The current implementation corresponds 
                  * to the case A(2,7) in the paper. It also corresponds to 
-                 * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
+                 * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
                  * Systems, 2003 p.173. For the preconditioned version see 
                  * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
                  */
                 const RealScalar beta(beta_new);
                 v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
-//                const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT
+                v_new /= beta_new; // overwrite v_new for next iteration
+                w_new /= beta_new; // overwrite w_new for next iteration
                 v = v_new; // update
                 w = w_new; // update
-//                const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT
                 v_new.noalias() = mat*w - beta*v_old; // compute v_new
                 const RealScalar alpha = v_new.dot(w);
                 v_new -= alpha*v; // overwrite v_new
@@ -102,8 +101,6 @@ namespace Eigen {
                 beta_new2 = v_new.dot(w_new); // compute beta_new
                 eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
                 beta_new = sqrt(beta_new2); // compute beta_new
-                v_new /= beta_new; // overwrite v_new for next iteration
-                w_new /= beta_new; // overwrite w_new for next iteration
                 
                 // Givens rotation
                 const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration
@@ -117,7 +114,6 @@ namespace Eigen {
                 
                 // Update solution
                 p_oold = p_old;
-//                const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT
                 p_old = p;
                 p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
                 x += beta_one*c*eta*p;
@@ -237,7 +233,7 @@ namespace Eigen {
 
         /** \internal */
         template<typename Rhs,typename Dest>
-        void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+        void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
         {
             typedef typename Base::MatrixWrapper MatrixWrapper;
             typedef typename Base::ActualMatrixType ActualMatrixType;
@@ -257,28 +253,11 @@ namespace Eigen {
             m_iterations = Base::maxIterations();
             m_error = Base::m_tolerance;
             RowMajorWrapper row_mat(matrix());
-            for(int j=0; j<b.cols(); ++j)
-            {
-                m_iterations = Base::maxIterations();
-                m_error = Base::m_tolerance;
-                
-                typename Dest::ColXpr xj(x,j);
-                internal::minres(SelfAdjointWrapper(row_mat), b.col(j), xj,
-                                 Base::m_preconditioner, m_iterations, m_error);
-            }
-            
-            m_isInitialized = true;
+            internal::minres(SelfAdjointWrapper(row_mat), b, x,
+                             Base::m_preconditioner, m_iterations, m_error);
             m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
         }
         
-        /** \internal */
-        template<typename Rhs,typename Dest>
-        void _solve_impl(const Rhs& b, MatrixBase<Dest> &x) const
-        {
-            x.setZero();
-            _solve_with_guess_impl(b,x.derived());
-        }
-        
     protected:
         
     };
@@ -286,4 +265,3 @@ namespace Eigen {
 } // end namespace Eigen
 
 #endif // EIGEN_MINRES_H
-
diff --git a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h
index d113e6e9033c617f9bf4ea6a4405231fbc826e91..9b3eb53e06cc675f157b9a18991a383b947b21c9 100644
--- a/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h
+++ b/contrib/eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h
@@ -104,12 +104,18 @@ class IterScaling
         for (int i = 0; i < m; ++i) 
         {
           Dr(i) = std::sqrt(Dr(i));
+        }
+        for (int i = 0; i < n; ++i) 
+        {
           Dc(i) = std::sqrt(Dc(i));
         }
         // Save the scaling factors 
         for (int i = 0; i < m; ++i) 
         {
           m_left(i) /= Dr(i);
+        }
+        for (int i = 0; i < n; ++i) 
+        {
           m_right(i) /= Dc(i);
         }
         // Scale the rows and the columns of the matrix
diff --git a/contrib/eigen/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/contrib/eigen/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
index 582fa8512d513a58e538f6240df67f743892e40c..6a9b0be88a177a0ca761a2a4c9eb53b81db1fc64 100644
--- a/contrib/eigen/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+++ b/contrib/eigen/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -235,10 +235,10 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
     MaxRowsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
     MaxColsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
 
-    EvalToRowMajor = (LhsFlags & RhsFlags & RowMajorBit),
+    EvalToRowMajor = (int(LhsFlags) & int(RhsFlags) & RowMajorBit),
     RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
 
-    Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & RemovedBits)
           | EvalBeforeNestingBit,
     CoeffReadCost = HugeCost
   };
diff --git a/contrib/eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h b/contrib/eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
index ae9d793b1173c00d0d52269d3ebbfc64a5b8b896..123485817324d50b8a574a49ead593649ed3feae 100644
--- a/contrib/eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
+++ b/contrib/eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
@@ -73,7 +73,7 @@ void lmqrsolv(
             qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
             wa[k] = temp;
 
-            /*           accumulate the tranformation in the row of s. */
+            /*           accumulate the transformation in the row of s. */
             for (i = k+1; i<n; ++i) {
                 temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
                 sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
diff --git a/contrib/eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/contrib/eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index 9954279789903fd0a25faded5cac43a51d0594b2..62561da1d77b4963059d8710914df99e7de89f75 100644
--- a/contrib/eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/contrib/eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -117,7 +117,7 @@ class LevenbergMarquardt : internal::no_assignment_operator
     typedef typename JacobianType::RealScalar RealScalar; 
     typedef typename QRSolver::StorageIndex PermIndex;
     typedef Matrix<Scalar,Dynamic,1> FVectorType;
-    typedef PermutationMatrix<Dynamic,Dynamic> PermutationType;
+    typedef PermutationMatrix<Dynamic,Dynamic,int> PermutationType;
   public:
     LevenbergMarquardt(FunctorType& functor) 
     : m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0),
@@ -233,9 +233,9 @@ class LevenbergMarquardt : internal::no_assignment_operator
     
     /** 
      * \brief Reports whether the minimization was successful
-     * \returns \c Success if the minimization was succesful,
+     * \returns \c Success if the minimization was successful,
      *         \c NumericalIssue if a numerical problem arises during the 
-     *          minimization process, for exemple during the QR factorization
+     *          minimization process, for example during the QR factorization
      *         \c NoConvergence if the minimization did not converge after 
      *          the maximum number of function evaluation allowed
      *          \c InvalidInput if the input matrix is invalid
diff --git a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index 0b0ee6546a8ddb1889889efd9da7c434dc77ceaf..02284b0ddf0bc4799ff6721f8c1f7cd55e026199 100644
--- a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -314,7 +314,7 @@ struct matrix_exp_computeUV<MatrixType, long double>
       matrix_exp_pade17(A, U, V);
     }
   
-#elif LDBL_MANT_DIG <= 112  // quadruple precison
+#elif LDBL_MANT_DIG <= 113  // quadruple precision
   
     if (l1norm < 1.639394610288918690547467954466970e-005L) {
       matrix_exp_pade3(arg, U, V);
@@ -347,7 +347,7 @@ struct matrix_exp_computeUV<MatrixType, long double>
 template<typename T> struct is_exp_known_type : false_type {};
 template<> struct is_exp_known_type<float> : true_type {};
 template<> struct is_exp_known_type<double> : true_type {};
-#if LDBL_MANT_DIG <= 112
+#if LDBL_MANT_DIG <= 113
 template<> struct is_exp_known_type<long double> : true_type {};
 #endif
 
@@ -396,7 +396,6 @@ void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // d
 template<typename Derived> struct MatrixExponentialReturnValue
 : public ReturnByValue<MatrixExponentialReturnValue<Derived> >
 {
-    typedef typename Derived::Index Index;
   public:
     /** \brief Constructor.
       *
diff --git a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index 3df82394cb486a3453ed0644d2b731d4face5264..cc12ab62bae0569a70ebdd6c44e64a1feec48c17 100644
--- a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -53,7 +53,7 @@ template <typename MatrixType>
 typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A)
 {
   typedef typename plain_col_type<MatrixType>::type VectorType;
-  typename MatrixType::Index rows = A.rows();
+  Index rows = A.rows();
   const MatrixType N = MatrixType::Identity(rows, rows) - A;
   VectorType e = VectorType::Ones(rows);
   N.template triangularView<Upper>().solveInPlace(e);
@@ -65,7 +65,6 @@ MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
 {
   // TODO: Use that A is upper triangular
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename MatrixType::Index Index;
   Index rows = A.rows();
   Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
   MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
@@ -73,10 +72,10 @@ MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
   MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows);
   MatrixType P = Ashifted;
   MatrixType Fincr;
-  for (Index s = 1; s < 1.1 * rows + 10; s++) { // upper limit is fairly arbitrary
+  for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) { // upper limit is fairly arbitrary
     Fincr = m_f(avgEival, static_cast<int>(s)) * P;
     F += Fincr;
-    P = Scalar(RealScalar(1.0/(s + 1))) * P * Ashifted;
+    P = Scalar(RealScalar(1)/RealScalar(s + 1)) * P * Ashifted;
 
     // test whether Taylor series converged
     const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
@@ -131,7 +130,6 @@ typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOf
 template <typename EivalsType, typename Cluster>
 void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
 {
-  typedef typename EivalsType::Index Index;
   typedef typename EivalsType::RealScalar RealScalar;
   for (Index i=0; i<eivals.rows(); ++i) {
     // Find cluster containing i-th ei'val, adding a new cluster if necessary
@@ -179,7 +177,7 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
 {
   blockStart.resize(clusterSize.rows());
   blockStart(0) = 0;
-  for (typename VectorType::Index i = 1; i < clusterSize.rows(); i++) {
+  for (Index i = 1; i < clusterSize.rows(); i++) {
     blockStart(i) = blockStart(i-1) + clusterSize(i-1);
   }
 }
@@ -188,7 +186,6 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
 template <typename EivalsType, typename ListOfClusters, typename VectorType>
 void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster)
 {
-  typedef typename EivalsType::Index Index;
   eivalToCluster.resize(eivals.rows());
   Index clusterIndex = 0;
   for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
@@ -205,7 +202,6 @@ void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters&
 template <typename DynVectorType, typename VectorType>
 void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation)
 {
-  typedef typename VectorType::Index Index;
   DynVectorType indexNextEntry = blockStart;
   permutation.resize(eivalToCluster.rows());
   for (Index i = 0; i < eivalToCluster.rows(); i++) {
@@ -219,7 +215,6 @@ void matrix_function_compute_permutation(const DynVectorType& blockStart, const
 template <typename VectorType, typename MatrixType>
 void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T)
 {
-  typedef typename VectorType::Index Index;
   for (Index i = 0; i < permutation.rows() - 1; i++) {
     Index j;
     for (j = i; j < permutation.rows(); j++) {
@@ -247,7 +242,7 @@ template <typename MatrixType, typename AtomicType, typename VectorType>
 void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
 { 
   fT.setZero(T.rows(), T.cols());
-  for (typename VectorType::Index i = 0; i < clusterSize.rows(); ++i) {
+  for (Index i = 0; i < clusterSize.rows(); ++i) {
     fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
       = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
   }
@@ -285,7 +280,6 @@ MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const
   eigen_assert(C.rows() == A.rows());
   eigen_assert(C.cols() == B.rows());
 
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index m = A.rows();
@@ -330,11 +324,8 @@ void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorTyp
 { 
   typedef internal::traits<MatrixType> Traits;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
-  static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-  static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
   static const int Options = MatrixType::Options;
-  typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, Options, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
 
   for (Index k = 1; k < clusterSize.rows(); k++) {
     for (Index i = 0; i < clusterSize.rows() - k; i++) {
@@ -428,7 +419,8 @@ struct matrix_function_compute<MatrixType, 1>
     typedef internal::traits<MatrixType> Traits;
     
     // compute Schur decomposition of A
-    const ComplexSchur<MatrixType> schurOfA(A);  
+    const ComplexSchur<MatrixType> schurOfA(A);
+    eigen_assert(schurOfA.info()==Success);
     MatrixType T = schurOfA.matrixT();
     MatrixType U = schurOfA.matrixU();
 
@@ -480,7 +472,6 @@ template<typename Derived> class MatrixFunctionReturnValue
 {
   public:
     typedef typename Derived::Scalar Scalar;
-    typedef typename Derived::Index Index;
     typedef typename internal::stem_function<Scalar>::type StemFunction;
 
   protected:
@@ -505,10 +496,8 @@ template<typename Derived> class MatrixFunctionReturnValue
       typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
       typedef typename internal::remove_all<NestedEvalType>::type NestedEvalTypeClean;
       typedef internal::traits<NestedEvalTypeClean> Traits;
-      static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-      static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
       typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
 
       typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
       AtomicType atomic(m_f);
diff --git a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index cf5fffad3067d708f0011add329f66d579a39481..e917013e0ed5d91b808fb9d59aa09ce643f27deb 100644
--- a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -62,8 +62,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   else
   {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)));
-    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
+    RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,RealScalar(2*EIGEN_PI)*unwindingNumber)) / y;
   }
 }
 
@@ -135,7 +135,8 @@ void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree
   const int minPadeDegree = 3;
   const int maxPadeDegree = 11;
   assert(degree >= minPadeDegree && degree <= maxPadeDegree);
-
+  // FIXME this creates float-conversion-warnings if these are enabled.
+  // Either manually convert each value, or disable the warning locally
   const RealScalar nodes[][maxPadeDegree] = { 
     { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,  // degree 3
       0.8872983346207416885179265399782400L }, 
@@ -232,12 +233,13 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
   int degree;
   MatrixType T = A, sqrtT;
 
-  int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
-  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
+  const int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
+  const RealScalar maxNormForPade = RealScalar(
+                                    maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
                                     maxPadeDegree<= 7? 2.6429608311114350e-1L:                    // double precision
                                     maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
                                     maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
-                                                       1.1880960220216759245467951592883642e-1L;  // quadruple precision
+                                                       1.1880960220216759245467951592883642e-1L); // quadruple precision
 
   while (true) {
     RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
@@ -254,7 +256,7 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
   }
 
   matrix_log_compute_pade(result, T, degree);
-  result *= pow(RealScalar(2), numberOfSquareRoots);
+  result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots)); // TODO replace by bitshift if possible
 }
 
 /** \ingroup MatrixFunctions_Module
@@ -332,10 +334,8 @@ public:
     typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
     typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
     typedef internal::traits<DerivedEvalTypeClean> Traits;
-    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
     typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
     AtomicType atomic;
     
diff --git a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index a3273da4e533916576b6f45e8905c9cd17a72978..d7672d7c9739ee2f0688d3ca93b80475c4263175 100644
--- a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -40,7 +40,6 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
 {
   public:
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
 
     /**
      * \brief Constructor.
@@ -81,7 +80,7 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
  *
  * \note Currently this class is only used by MatrixPower. One may
  * insist that this be nested into MatrixPower. This class is here to
- * faciliate future development of triangular matrix functions.
+ * facilitate future development of triangular matrix functions.
  */
 template<typename MatrixType>
 class MatrixPowerAtomic : internal::noncopyable
@@ -94,7 +93,6 @@ class MatrixPowerAtomic : internal::noncopyable
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef std::complex<RealScalar> ComplexScalar;
-    typedef typename MatrixType::Index Index;
     typedef Block<MatrixType,Dynamic,Dynamic> ResultType;
 
     const MatrixType& m_A;
@@ -162,11 +160,11 @@ template<typename MatrixType>
 void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, ResultType& res) const
 {
   int i = 2*degree;
-  res = (m_p-degree) / (2*i-2) * IminusT;
+  res = (m_p-RealScalar(degree)) / RealScalar(2*i-2) * IminusT;
 
   for (--i; i; --i) {
     res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView<Upper>()
-	.solve((i==1 ? -m_p : i&1 ? (-m_p-i/2)/(2*i) : (m_p-i/2)/(2*i-2)) * IminusT).eval();
+	.solve((i==1 ? -m_p : i&1 ? (-m_p-RealScalar(i/2))/RealScalar(2*i) : (m_p-RealScalar(i/2))/RealScalar(2*i-2)) * IminusT).eval();
   }
   res += MatrixType::Identity(IminusT.rows(), IminusT.cols());
 }
@@ -196,11 +194,12 @@ void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const
 {
   using std::ldexp;
   const int digits = std::numeric_limits<RealScalar>::digits;
-  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1L                            // single precision
+  const RealScalar maxNormForPade = RealScalar(
+                                    digits <=  24? 4.3386528e-1L                            // single precision
                                   : digits <=  53? 2.789358995219730e-1L                    // double precision
                                   : digits <=  64? 2.4471944416607995472e-1L                // extended precision
                                   : digits <= 106? 1.1016843812851143391275867258512e-1L    // double-double
-                                  :                9.134603732914548552537150753385375e-2L; // quadruple precision
+                                  :                9.134603732914548552537150753385375e-2L); // quadruple precision
   MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
   RealScalar normIminusT;
   int degree, degree2, numberOfSquareRoots = 0;
@@ -298,8 +297,8 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
 
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
-  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
-  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
+  RealScalar unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI)*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
 
@@ -340,7 +339,6 @@ class MatrixPower : internal::noncopyable
   private:
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
 
   public:
     /**
@@ -600,7 +598,6 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Deri
   public:
     typedef typename Derived::PlainObject PlainObject;
     typedef typename Derived::RealScalar RealScalar;
-    typedef typename Derived::Index Index;
 
     /**
      * \brief Constructor.
@@ -648,7 +645,6 @@ class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerRe
   public:
     typedef typename Derived::PlainObject PlainObject;
     typedef typename std::complex<typename Derived::RealScalar> ComplexScalar;
-    typedef typename Derived::Index Index;
 
     /**
      * \brief Constructor.
diff --git a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index 9de0c3574eb522d7a6dc0ee0c54a2a32177a6e17..e363e779dea5da37ff6aab6b4b61a5ac6c5bd205 100644
--- a/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/contrib/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -17,7 +17,7 @@ namespace internal {
 // pre:  T.block(i,i,2,2) has complex conjugate eigenvalues
 // post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typename MatrixType::Index i, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT)
 {
   // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
   //       in EigenSolver. If we expose it, we could call it directly from here.
@@ -32,7 +32,7 @@ void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typena
 //       all blocks of sqrtT to left of and below (i,j) are correct
 // post: sqrtT(i,j) has the correct value
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
@@ -41,7 +41,7 @@ void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, ty
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
@@ -54,7 +54,7 @@ void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, ty
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
@@ -101,7 +101,7 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
@@ -204,7 +204,7 @@ template <typename MatrixType, typename ResultType>
 void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result)
 {
   using std::sqrt;
-      typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Scalar Scalar;
 
   eigen_assert(arg.rows() == arg.cols());
 
diff --git a/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
index 8fe3ed86b28aa929c9e310d07d8d70dffb4ff6fe..07c5ef01420b83443a795b5a02342d05843d2f7c 100644
--- a/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+++ b/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -52,7 +52,7 @@ public:
         Parameters()
             : factor(Scalar(100.))
             , maxfev(1000)
-            , xtol(std::sqrt(NumTraits<Scalar>::epsilon()))
+            , xtol(numext::sqrt(NumTraits<Scalar>::epsilon()))
             , nb_of_subdiagonals(-1)
             , nb_of_superdiagonals(-1)
             , epsfcn(Scalar(0.)) {}
@@ -70,7 +70,7 @@ public:
 
     HybridNonLinearSolverSpace::Status hybrj1(
             FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
             );
 
     HybridNonLinearSolverSpace::Status solveInit(FVectorType  &x);
@@ -79,7 +79,7 @@ public:
 
     HybridNonLinearSolverSpace::Status hybrd1(
             FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
             );
 
     HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType  &x);
diff --git a/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h b/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
index feafd62a8dd686442979afc9098e86de1da9484a..4f2f560b3cbc6b1de08860e99b04253a1b7fe49b 100644
--- a/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
+++ b/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
@@ -61,7 +61,7 @@ void qrsolv(
             qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
             wa[k] = temp;
 
-            /*           accumulate the tranformation in the row of s. */
+            /*           accumulate the transformation in the row of s. */
             for (i = k+1; i<n; ++i) {
                 temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
                 sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
diff --git a/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
index f28766061d5d9880eb641a081aed50ea32343ab9..09fc6525590c6233344b88bcb58a7999ad045e93 100644
--- a/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
+++ b/contrib/eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -22,7 +22,7 @@ void r1updt(
     Scalar temp;
     JacobiRotation<Scalar> givens;
 
-    // r1updt had a broader usecase, but we dont use it here. And, more
+    // r1updt had a broader usecase, but we don't use it here. And, more
     // importantly, we can not test it.
     eigen_assert(m==n);
     eigen_assert(u.size()==m);
diff --git a/contrib/eigen/unsupported/Eigen/src/Polynomials/Companion.h b/contrib/eigen/unsupported/Eigen/src/Polynomials/Companion.h
index 359836cac8018dbfe6a200b2615753f47f1a8b8b..59a15b098e2e6e655babaa6b5beccb68e4daefda 100644
--- a/contrib/eigen/unsupported/Eigen/src/Polynomials/Companion.h
+++ b/contrib/eigen/unsupported/Eigen/src/Polynomials/Companion.h
@@ -20,12 +20,6 @@ namespace internal {
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-template <typename T>
-T radix(){ return 2; }
-
-template <typename T>
-T radix2(){ return radix<T>()*radix<T>(); }
-
 template<int Size>
 struct decrement_if_fixed_size
 {
@@ -103,7 +97,7 @@ class companion
     /** Helper function for the balancing algorithm.
      * \returns true if the row and the column, having colNorm and rowNorm
      * as norms, are balanced, false otherwise.
-     * colB and rowB are repectively the multipliers for
+     * colB and rowB are respectively the multipliers for
      * the column and the row in order to balance them.
      * */
     bool balanced( RealScalar colNorm, RealScalar rowNorm,
@@ -112,7 +106,7 @@ class companion
     /** Helper function for the balancing algorithm.
      * \returns true if the row and the column, having colNorm and rowNorm
      * as norms, are balanced, false otherwise.
-     * colB and rowB are repectively the multipliers for
+     * colB and rowB are respectively the multipliers for
      * the column and the row in order to balance them.
      * */
     bool balancedR( RealScalar colNorm, RealScalar rowNorm,
@@ -141,7 +135,10 @@ inline
 bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm,
     bool& isBalanced, RealScalar& colB, RealScalar& rowB )
 {
-  if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; }
+  if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm 
+      || !(numext::isfinite)(colNorm) || !(numext::isfinite)(rowNorm)){
+    return true;
+  }
   else
   {
     //To find the balancing coefficients, if the radix is 2,
@@ -149,33 +146,41 @@ bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm,
     // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$
     // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$
     // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$
-    rowB = rowNorm / radix<RealScalar>();
+    const RealScalar radix = RealScalar(2);
+    const RealScalar radix2 = RealScalar(4);
+    
+    rowB = rowNorm / radix;
     colB = RealScalar(1);
     const RealScalar s = colNorm + rowNorm;
 
-    while (colNorm < rowB)
+    // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm
+    RealScalar scout = colNorm;
+    while (scout < rowB)
     {
-      colB *= radix<RealScalar>();
-      colNorm *= radix2<RealScalar>();
+      colB *= radix;
+      scout *= radix2;
     }
-
-    rowB = rowNorm * radix<RealScalar>();
-
-    while (colNorm >= rowB)
+    
+    // We now have an upper-bound for sigma, try to lower it.
+    // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm
+    scout = colNorm * (colB / radix) * colB;  // Avoid overflow.
+    while (scout >= rowNorm)
     {
-      colB /= radix<RealScalar>();
-      colNorm /= radix2<RealScalar>();
+      colB /= radix;
+      scout /= radix2;
     }
 
-    //This line is used to avoid insubstantial balancing
-    if ((rowNorm + colNorm) < RealScalar(0.95) * s * colB)
+    // This line is used to avoid insubstantial balancing.
+    if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB)
     {
       isBalanced = false;
       rowB = RealScalar(1) / colB;
       return false;
     }
-    else{
-      return true; }
+    else
+    {
+      return true;
+    }
   }
 }
 
diff --git a/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
index a1f54ed359b037f4fd99a2cf134306ae77fc104f..6d0370d5b8c0f26ed87c22bd778a05bb21cd7628 100644
--- a/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
+++ b/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
@@ -41,7 +41,7 @@ public:
 
     /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
      *
-     * Setting a value greater than zero speeds up computation, and yields to an imcomplete
+     * Setting a value greater than zero speeds up computation, and yields to an incomplete
      * factorization with fewer non zero coefficients. Such approximate factors are especially
      * useful to initialize an iterative solver.
      *
@@ -349,4 +349,4 @@ bool SkylineInplaceLU<MatrixType>::solve(const MatrixBase<BDerived> &b, MatrixBa
 
 } // end namespace Eigen
 
-#endif // EIGEN_SKYLINELU_H
+#endif // EIGEN_SKYLINEINPLACELU_H
diff --git a/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h
index a2a8933ca50481637264a4f60b366af9c47e00b4..7c7eace7f941c3594523e15c6c455485ca3827a7 100644
--- a/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h
+++ b/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h
@@ -206,26 +206,26 @@ public:
             if (col > row) //upper matrix
             {
                 const Index minOuterIndex = inner - m_data.upperProfile(inner);
-                eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
             }
             if (col < row) //lower matrix
             {
                 const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-                eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
             }
         } else {
             if (outer > inner) //upper matrix
             {
                 const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-                eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
             }
             if (outer < inner) //lower matrix
             {
                 const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-                eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
             }
         }
@@ -300,11 +300,11 @@ public:
 
         if (IsRowMajor) {
             const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-            eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
         } else {
             const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-            eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
         }
     }
@@ -336,11 +336,11 @@ public:
 
         if (IsRowMajor) {
             const Index minOuterIndex = inner - m_data.upperProfile(inner);
-            eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
         } else {
             const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-            eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
         }
     }
@@ -859,4 +859,4 @@ protected:
 
 } // end namespace Eigen
 
-#endif // EIGEN_SkylineMatrix_H
+#endif // EIGEN_SKYLINEMATRIX_H
diff --git a/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h b/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
index b3a237230c14a4d2536dd57b7e85ae0861ac3f2f..b0d5e10011fd2e6fca916069aae2f015c8985ac8 100644
--- a/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
+++ b/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
@@ -12,7 +12,7 @@
 
 #include "SkylineUtil.h"
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Skyline_Module
  *
@@ -102,18 +102,18 @@ public:
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
     /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-    inline Index rows() const {
+    inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
         return derived().rows();
     }
 
     /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-    inline Index cols() const {
+    inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
         return derived().cols();
     }
 
     /** \returns the number of coefficients, which is \a rows()*cols().
      * \sa rows(), cols(), SizeAtCompileTime. */
-    inline Index size() const {
+    inline EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT {
         return rows() * cols();
     }
 
@@ -209,4 +209,4 @@ protected:
 
 } // end namespace Eigen
 
-#endif // EIGEN_SkylineMatrixBase_H
+#endif // EIGEN_SKYLINEMATRIXBASE_H
diff --git a/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h b/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h
index 378a8deb4fc1c9af3e4a8f4f9f3487a55958b221..cc7514f1232e1641a6abf272cd315225b1c2520f 100644
--- a/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h
+++ b/contrib/eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h
@@ -256,4 +256,4 @@ public:
 
 } // end namespace Eigen
 
-#endif // EIGEN_COMPRESSED_STORAGE_H
+#endif // EIGEN_SKYLINE_STORAGE_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/contrib/eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
index 0ffbc43d23e931d1d542b51d56dfd20a7775da03..42c99e4670cd0662f80f4fa8a24a9fb8aee858f3 100644
--- a/contrib/eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
+++ b/contrib/eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
@@ -187,7 +187,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
     /** Does nothing: provided for compatibility with SparseMatrix */
     inline void finalize() {}
 
-    /** Suppress all nonzeros which are smaller than \a reference under the tolerence \a epsilon */
+    /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */
     void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       for (Index j=0; j<outerSize(); ++j)
@@ -224,7 +224,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
       }
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     EIGEN_DEPRECATED inline DynamicSparseMatrix()
       : m_innerSize(0), m_data(0)
     {
@@ -234,7 +234,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
       eigen_assert(innerSize()==0 && outerSize()==0);
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
       : m_innerSize(0)
     {
@@ -244,7 +244,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
       resize(rows, cols);
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     template<typename OtherDerived>
     EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
       : m_innerSize(0)
diff --git a/contrib/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h b/contrib/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 04b7d69ac3373e649383ae33a699c0385532290d..dd786d561f27fb61d50dfa8cd58e06f48b415946 100644
--- a/contrib/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/contrib/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -12,38 +12,38 @@
 #define EIGEN_SPARSE_MARKET_IO_H
 
 #include <iostream>
+#include <vector>
 
 namespace Eigen { 
 
 namespace internal 
 {
-  template <typename Scalar,typename IndexType>
-  inline bool GetMarketLine (std::stringstream& line, IndexType& M, IndexType& N, IndexType& i, IndexType& j, Scalar& value)
+  template <typename Scalar, typename StorageIndex>
+  inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, Scalar& value)
   {
-    line >> i >> j >> value;
-    i--;
-    j--;
-    if(i>=0 && j>=0 && i<M && j<N)
-    {
-      return true; 
-    }
-    else
-      return false;
+    std::stringstream sline(line);
+    sline >> i >> j >> value;
   }
-  template <typename Scalar,typename IndexType>
-  inline bool GetMarketLine (std::stringstream& line, IndexType& M, IndexType& N, IndexType& i, IndexType& j, std::complex<Scalar>& value)
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, float& value)
+  { std::sscanf(line, "%d %d %g", &i, &j, &value); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, double& value)
+  { std::sscanf(line, "%d %d %lg", &i, &j, &value); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<float>& value)
+  { std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<double>& value)
+  { std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
+
+  template <typename Scalar, typename StorageIndex>
+  inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, std::complex<Scalar>& value)
   {
+    std::stringstream sline(line);
     Scalar valR, valI;
-    line >> i >> j >> valR >> valI;
-    i--;
-    j--;
-    if(i>=0 && j>=0 && i<M && j<N)
-    {
-      value = std::complex<Scalar>(valR, valI);
-      return true; 
-    }
-    else
-      return false;
+    sline >> i >> j >> valR >> valI;
+    value = std::complex<Scalar>(valR,valI);
   }
 
   template <typename RealScalar>
@@ -81,13 +81,13 @@ namespace internal
     }
   }
 
-  template<typename Scalar>
-  inline void PutMatrixElt(Scalar value, int row, int col, std::ofstream& out)
+  template<typename Scalar, typename StorageIndex>
+  inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out)
   {
     out << row << " "<< col << " " << value << "\n";
   }
-  template<typename Scalar>
-  inline void PutMatrixElt(std::complex<Scalar> value, int row, int col, std::ofstream& out)
+  template<typename Scalar, typename StorageIndex>
+  inline void PutMatrixElt(std::complex<Scalar> value, StorageIndex row, StorageIndex col, std::ofstream& out)
   {
     out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
   }
@@ -101,10 +101,10 @@ namespace internal
   template<typename Scalar>
   inline void putVectorElt(std::complex<Scalar> value, std::ofstream& out)
   {
-    out << value.real << " " << value.imag()<< "\n"; 
+    out << value.real() << " " << value.imag()<< "\n"; 
   }
 
-} // end namepsace internal
+} // end namespace internal
 
 inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
 {
@@ -138,6 +138,9 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
   std::ifstream input(filename.c_str(),std::ios::in);
   if(!input)
     return false;
+
+  char rdbuffer[4096];
+  input.rdbuf()->pubsetbuf(rdbuffer, 4096);
   
   const int maxBuffersize = 2048;
   char buffer[maxBuffersize];
@@ -147,24 +150,22 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
   typedef Triplet<Scalar,StorageIndex> T;
   std::vector<T> elements;
   
-  StorageIndex M(-1), N(-1), NNZ(-1);
-  StorageIndex count = 0;
+  Index M(-1), N(-1), NNZ(-1);
+  Index count = 0;
   while(input.getline(buffer, maxBuffersize))
   {
     // skip comments   
     //NOTE An appropriate test should be done on the header to get the  symmetry
     if(buffer[0]=='%')
       continue;
-    
-    std::stringstream line(buffer);
-    
+
     if(!readsizes)
     {
+      std::stringstream line(buffer);
       line >> M >> N >> NNZ;
-      if(M > 0 && N > 0 && NNZ > 0) 
+      if(M > 0 && N > 0)
       {
         readsizes = true;
-        //std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
         mat.resize(M,N);
         mat.reserve(NNZ);
       }
@@ -173,15 +174,20 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
     { 
       StorageIndex i(-1), j(-1);
       Scalar value; 
-      if( internal::GetMarketLine(line, M, N, i, j, value) ) 
+      internal::GetMarketLine(buffer, i, j, value);
+
+      i--;
+      j--;
+      if(i>=0 && j>=0 && i<M && j<N)
       {
-        ++ count;
+        ++count;
         elements.push_back(T(i,j,value));
       }
-      else 
+      else
         std::cerr << "Invalid read: " << i << "," << j << "\n";        
     }
   }
+
   mat.setFromTriplets(elements.begin(), elements.end());
   if(count!=NNZ)
     std::cerr << count << "!=" << NNZ << "\n";
@@ -226,12 +232,13 @@ template<typename SparseMatrixType>
 bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0)
 {
   typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::RealScalar RealScalar;
   std::ofstream out(filename.c_str(),std::ios::out);
   if(!out)
     return false;
   
   out.flags(std::ios_base::scientific);
-  out.precision(64);
+  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
   std::string header; 
   internal::putMarketHeader<Scalar>(header, sym); 
   out << header << std::endl; 
@@ -242,7 +249,6 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
     {
       ++ count;
       internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
-      // out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n";
     }
   out.close();
   return true;
@@ -251,13 +257,14 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
 template<typename VectorType>
 bool saveMarketVector (const VectorType& vec, const std::string& filename)
 {
- typedef typename VectorType::Scalar Scalar; 
+ typedef typename VectorType::Scalar Scalar;
+ typedef typename VectorType::RealScalar RealScalar;
  std::ofstream out(filename.c_str(),std::ios::out);
   if(!out)
     return false;
   
   out.flags(std::ios_base::scientific);
-  out.precision(64);
+  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
   if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
       out << "%%MatrixMarket matrix array complex general\n"; 
   else
diff --git a/contrib/eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/contrib/eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h
index ee97299af9b594ad0292de42539855bae9c66880..985702b5ff0543a7dc4fb5451c9c1f4dd422b0ce 100644
--- a/contrib/eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h
+++ b/contrib/eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h
@@ -10,7 +10,13 @@
 #ifndef EIGEN_RANDOMSETTER_H
 #define EIGEN_RANDOMSETTER_H
 
-namespace Eigen { 
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+// Ensure the ::google namespace exists, required for checking existence of 
+// ::google::dense_hash_map and ::google::sparse_hash_map.
+namespace google {}
+#endif
+
+namespace Eigen {
 
 /** Represents a std::map
   *
@@ -56,7 +62,26 @@ template<typename Scalar> struct StdUnorderedMapTraits
 };
 #endif // EIGEN_UNORDERED_MAP_SUPPORT
 
-#ifdef _DENSE_HASH_MAP_H_
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+
+namespace google {
+  
+// Namespace work-around, since sometimes dense_hash_map and sparse_hash_map
+// are in the global namespace, and other times they are under ::google.
+using namespace ::google;
+
+template<typename KeyType, typename Scalar>
+struct DenseHashMap {
+  typedef dense_hash_map<KeyType, Scalar> type;
+};
+
+template<typename KeyType, typename Scalar>
+struct SparseHashMap {
+  typedef sparse_hash_map<KeyType, Scalar> type;
+};
+
+} // namespace google
+
 /** Represents a google::dense_hash_map
   *
   * \see RandomSetter
@@ -64,7 +89,7 @@ template<typename Scalar> struct StdUnorderedMapTraits
 template<typename Scalar> struct GoogleDenseHashMapTraits
 {
   typedef int KeyType;
-  typedef google::dense_hash_map<KeyType,Scalar> Type;
+  typedef typename google::DenseHashMap<KeyType,Scalar>::type Type;
   enum {
     IsSorted = 0
   };
@@ -72,9 +97,7 @@ template<typename Scalar> struct GoogleDenseHashMapTraits
   static void setInvalidKey(Type& map, const KeyType& k)
   { map.set_empty_key(k); }
 };
-#endif
 
-#ifdef _SPARSE_HASH_MAP_H_
 /** Represents a google::sparse_hash_map
   *
   * \see RandomSetter
@@ -82,7 +105,7 @@ template<typename Scalar> struct GoogleDenseHashMapTraits
 template<typename Scalar> struct GoogleSparseHashMapTraits
 {
   typedef int KeyType;
-  typedef google::sparse_hash_map<KeyType,Scalar> Type;
+  typedef typename google::SparseHashMap<KeyType,Scalar>::type Type;
   enum {
     IsSorted = 0
   };
@@ -134,18 +157,17 @@ template<typename Scalar> struct GoogleSparseHashMapTraits
   * GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits.
   *
   * For performance and memory consumption reasons it is highly recommended to use one of
-  * the Google's hash_map implementation. To enable the support for them, you have two options:
-  *  - \#include <google/dense_hash_map> yourself \b before Eigen/Sparse header
-  *  - define EIGEN_GOOGLEHASH_SUPPORT
-  * In the later case the inclusion of <google/dense_hash_map> is made for you.
+  * Google's hash_map implementations. To enable the support for them, you must define
+  * EIGEN_GOOGLEHASH_SUPPORT. This will include both <google/dense_hash_map> and
+  * <google/sparse_hash_map> for you.
   *
-  * \see http://code.google.com/p/google-sparsehash/
+  * \see https://github.com/sparsehash/sparsehash
   */
 template<typename SparseMatrixType,
          template <typename T> class MapTraits =
-#if defined _DENSE_HASH_MAP_H_
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
           GoogleDenseHashMapTraits
-#elif defined _HASH_MAP
+#elif defined(_HASH_MAP)
           GnuHashMapTraits
 #else
           StdMapTraits
@@ -249,10 +271,10 @@ class RandomSetter
           }
         }
         // prefix sum
-        Index count = 0;
+        StorageIndex count = 0;
         for (Index j=0; j<mp_target->outerSize(); ++j)
         {
-          Index tmp = positions[j];
+          StorageIndex tmp = positions[j];
           mp_target->outerIndexPtr()[j] = count;
           positions[j] = count;
           count += tmp;
@@ -281,7 +303,7 @@ class RandomSetter
               mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i];
               --i;
             }
-            mp_target->innerIndexPtr()[i+1] = inner;
+            mp_target->innerIndexPtr()[i+1] = internal::convert_index<StorageIndex>(inner);
             mp_target->valuePtr()[i+1] = it->second.value;
           }
         }
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
new file mode 100644
index 0000000000000000000000000000000000000000..41d2bf61c1fa3c44658db01148cbd8d6bc97ff2e
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
@@ -0,0 +1,286 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+#define EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \returns an expression of the coefficient-wise i0(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>, const Derived>
+bessel_i0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i0e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i0e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i0e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>, const Derived>
+bessel_i0e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>, const Derived>
+bessel_i1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i1e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i1e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>, const Derived>
+bessel_i1e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>, const Derived>
+bessel_k0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k0e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k0e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>, const Derived>
+bessel_k0e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>, const Derived>
+bessel_k1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k1e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k1e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>, const Derived>
+bessel_k1e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j0(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of j0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_j0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>, const Derived>
+bessel_j0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y0(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of y0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_y0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>, const Derived>
+bessel_y0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of j1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_j1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>, const Derived>
+bessel_j1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y1(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of y1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_y1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>, const Derived>
+bessel_y1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..6049cc2feeef88b4002657598ba247a2825416db
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
@@ -0,0 +1,68 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+#define EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_BFLOAT16_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
new file mode 100644
index 0000000000000000000000000000000000000000..8606a9f8e8b7d6bf3bf70d01b0fc1499d5f45063
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+#define EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order zero.
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i0;
+    return bessel_i0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions. We also add
+    // the cost of an additional exp over i0e.
+    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i0e;
+    return bessel_i0e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i0e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i1;
+    return bessel_i1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions. We also add
+    // the cost of an additional exp over i1e.
+    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i1e;
+    return bessel_i1e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i1e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_j0()
+ */
+template <typename Scalar>
+struct scalar_bessel_j0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_j0;
+    return bessel_j0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_j0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j0_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine and rsqrt cost.
+    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_y0()
+ */
+template <typename Scalar>
+struct scalar_bessel_y0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_y0;
+    return bessel_y0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_y0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y0_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine, rsqrt and j0 cost.
+    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the first kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1()
+ */
+template <typename Scalar>
+struct scalar_bessel_j1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_j1;
+    return bessel_j1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_j1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j1_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine and rsqrt cost.
+    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_y1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_y1;
+    return bessel_y1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_y1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y1_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine, rsqrt and j1 cost.
+    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the second
+ * kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k0;
+    return bessel_k0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i0, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k0e;
+    return bessel_k0e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k0e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i0, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the
+ * second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k1;
+    return bessel_k1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i1, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k1e;
+    return bessel_k1e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k1e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i1, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_FUNCTORS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
new file mode 100644
index 0000000000000000000000000000000000000000..8930d1a3cb6a472e7a612c8d25aded848560d09c
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
@@ -0,0 +1,66 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_HALF_H
+#define EIGEN_BESSELFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_HALF_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..24812be1bf6765a8fdfac60d3c934e56c7201802
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
@@ -0,0 +1,1959 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSEL_FUNCTIONS_H
+#define EIGEN_BESSEL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+//  Parts of this code are based on the Cephes Math Library.
+//
+//  Cephes Math Library Release 2.8:  June, 2000
+//  Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+//  Permission has been kindly provided by the original author
+//  to incorporate the Cephes software into the Eigen codebase:
+//
+//    From: Stephen Moshier
+//    To: Eugene Brevdo
+//    Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+//    Hello Eugene,
+//
+//    Thank you for writing.
+//
+//    If your licensing is similar to BSD, the formal way that has been
+//    handled is simply to add a statement to the effect that you are incorporating
+//    the Cephes software by permission of the author.
+//
+//    Good luck with your project,
+//    Steve
+
+
+/****************************************************************************
+ * Implementation of Bessel function, based on Cephes                       *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct bessel_i0e_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_i0e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i0ef.c
+     *
+     *  Modified Bessel function of order zero,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, i0ef();
+     *
+     * y = i0ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order zero of the argument.
+     *
+     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        100000      3.7e-7      7.0e-8
+     * See i0f().
+     *
+     */
+
+    const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,
+                       -2.67079385394061173391E-7f, 1.11738753912010371815E-6f,
+                       -4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
+                       -5.75419501008210370398E-5f, 1.88502885095841655729E-4f,
+                       -5.76375574538582365885E-4f, 1.63947561694133579842E-3f,
+                       -4.32430999505057594430E-3f, 1.05464603945949983183E-2f,
+                       -2.37374148058994688156E-2f, 4.93052842396707084878E-2f,
+                       -9.49010970480476444210E-2f, 1.71620901522208775349E-1f,
+                       -3.04682672343198398683E-1f, 6.76795274409476084995E-1f};
+
+    const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f,
+                       2.04891858946906374183E-7f, 2.89137052083475648297E-6f,
+                       6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
+                       8.04490411014108831608E-1f};
+    T y = pabs(x);
+    T y_le_eight = internal::pchebevl<T, 18>::run(
+        pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A);
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 7>::run(
+            psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    return pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+  }
+};
+
+template <typename T>
+struct generic_i0e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i0e.c
+     *
+     *  Modified Bessel function of order zero,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, i0e();
+     *
+     * y = i0e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order zero of the argument.
+     *
+     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       5.4e-16     1.2e-16
+     * See i0().
+     *
+     */
+
+    const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
+                        -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+                        -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+                        -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+                        -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+                        -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+                        -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+                        -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+                        -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+                        -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+                        -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+                        -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+                        -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+                        -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+                        -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+    const double B[] = {
+        -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+        4.46562142029675999901E-17,  3.46122286769746109310E-17,
+        -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+        1.77256013305652638360E-15,  3.81168066935262242075E-15,
+        -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+        1.54008621752140982691E-14,  3.85277838274214270114E-13,
+        7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+        -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+        1.18891471078464383424E-11,  4.94060238822496958910E-10,
+        3.39623202570838634515E-9,   2.26666899049817806459E-8,
+        2.04891858946906374183E-7,   2.89137052083475648297E-6,
+        6.88975834691682398426E-5,   3.36911647825569408990E-3,
+        8.04490411014108831608E-1};
+    T y = pabs(x);
+    T y_le_eight = internal::pchebevl<T, 30>::run(
+        pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A);
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    return pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+  }
+};
+
+template <typename T>
+struct bessel_i0e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i0e<T>::run(x);
+  }
+};
+
+template <typename Scalar>
+struct bessel_i0_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return pmul(
+        pexp(pabs(x)),
+        generic_i0e<T, ScalarType>::run(x));
+  }
+};
+
+template <typename T>
+struct bessel_i0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i0<T>::run(x);
+  }
+};
+
+template <typename Scalar>
+struct bessel_i1e_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type >
+struct generic_i1e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_i1e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* i1ef.c
+     *
+     *  Modified Bessel function of order one,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, i1ef();
+     *
+     * y = i1ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order one of the argument.
+     *
+     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.5e-6      1.5e-7
+     * See i1().
+     *
+     */
+    const float A[] = {9.38153738649577178388E-9f, -4.44505912879632808065E-8f,
+                       2.00329475355213526229E-7f, -8.56872026469545474066E-7f,
+                       3.47025130813767847674E-6f, -1.32731636560394358279E-5f,
+                       4.78156510755005422638E-5f, -1.61760815825896745588E-4f,
+                       5.12285956168575772895E-4f, -1.51357245063125314899E-3f,
+                       4.15642294431288815669E-3f, -1.05640848946261981558E-2f,
+                       2.47264490306265168283E-2f, -5.29459812080949914269E-2f,
+                       1.02643658689847095384E-1f, -1.76416518357834055153E-1f,
+                       2.52587186443633654823E-1f};
+
+    const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f,
+                       -2.51223623787020892529E-7f, -3.88256480887769039346E-6f,
+                       -1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
+                       7.78576235018280120474E-1f};
+
+
+    T y = pabs(x);
+    T y_le_eight = pmul(y, internal::pchebevl<T, 17>::run(
+        pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A));
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 7>::run(
+            psub(pdiv(pset1<T>(32.0f), y),
+                 pset1<T>(2.0f)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    y = pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+    return pselect(pcmp_lt(x, pset1<T>(0.0f)), pnegate(y), y);
+  }
+};
+
+template <typename T>
+struct generic_i1e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i1e.c
+     *
+     *  Modified Bessel function of order one,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, i1e();
+     *
+     * y = i1e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order one of the argument.
+     *
+     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       2.0e-15     2.0e-16
+     * See i1().
+     *
+     */
+    const double A[] = {2.77791411276104639959E-18, -2.11142121435816608115E-17,
+                        1.55363195773620046921E-16, -1.10559694773538630805E-15,
+                        7.60068429473540693410E-15, -5.04218550472791168711E-14,
+                        3.22379336594557470981E-13, -1.98397439776494371520E-12,
+                        1.17361862988909016308E-11, -6.66348972350202774223E-11,
+                        3.62559028155211703701E-10, -1.88724975172282928790E-9,
+                        9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+                        2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+                        3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+                        4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+                        5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+                        4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+                        2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+                        1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+                        2.52587186443633654823E-1};
+    const double B[] = {
+        7.51729631084210481353E-18,  4.41434832307170791151E-18,
+        -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+        2.96262899764595013876E-16,  3.30820231092092828324E-16,
+        -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+        1.04202769841288027642E-14,  4.27244001671195135429E-14,
+        -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+        -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+        1.41258074366137813316E-11,  3.25260358301548823856E-11,
+        -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+        -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+        -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+        -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+        7.78576235018280120474E-1};
+    T y = pabs(x);
+    T y_le_eight = pmul(y, internal::pchebevl<T, 29>::run(
+        pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A));
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(32.0), y),
+                 pset1<T>(2.0)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    y = pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+    return pselect(pcmp_lt(x, pset1<T>(0.0)), pnegate(y), y);
+  }
+};
+
+template <typename T>
+struct bessel_i1e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i1e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_i1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return pmul(
+        pexp(pabs(x)),
+        generic_i1e<T, ScalarType>::run(x));
+  }
+};
+
+template <typename T>
+struct bessel_i1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k0e_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k0e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0ef.c
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k0ef();
+     *
+     * y = k0ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       8.1e-7      7.8e-8
+     * See k0().
+     *
+     */
+
+    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
+                       2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
+                       3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+                       -5.35327393233902768720E-1f};
+
+    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
+                       -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
+                       -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
+                       -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, float>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pmul(pexp(x), x_le_two);
+    T x_gt_two = pmul(
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x));
+    return pselect(
+        pcmp_le(x, pset1<T>(0.0)),
+        MAXNUM,
+        pselect(pcmp_le(x, two), x_le_two, x_gt_two));
+  }
+};
+
+template <typename T>
+struct generic_k0e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0e.c
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k0e();
+     *
+     * y = k0e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
+     * See k0().
+     *
+     */
+
+    const double A[] = {
+      1.37446543561352307156E-16,
+      4.25981614279661018399E-14,
+      1.03496952576338420167E-11,
+      1.90451637722020886025E-9,
+      2.53479107902614945675E-7,
+      2.28621210311945178607E-5,
+      1.26461541144692592338E-3,
+      3.59799365153615016266E-2,
+      3.44289899924628486886E-1,
+      -5.35327393233902768720E-1};
+    const double B[] = {
+       5.30043377268626276149E-18, -1.64758043015242134646E-17,
+       5.21039150503902756861E-17, -1.67823109680541210385E-16,
+       5.51205597852431940784E-16, -1.84859337734377901440E-15,
+       6.34007647740507060557E-15, -2.22751332699166985548E-14,
+       8.03289077536357521100E-14, -2.98009692317273043925E-13,
+       1.14034058820847496303E-12, -4.51459788337394416547E-12,
+       1.85594911495471785253E-11, -7.95748924447710747776E-11,
+       3.57739728140030116597E-10, -1.69753450938905987466E-9,
+       8.57403401741422608519E-9, -4.66048989768794782956E-8,
+       2.76681363944501510342E-7, -1.83175552271911948767E-6,
+       1.39498137188764993662E-5, -1.28495495816278026384E-4,
+       1.56988388573005337491E-3, -3.14481013119645005427E-2,
+       2.44030308206595545468E0
+    };
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 10>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, double>::run(x), pmul(
+            pset1<T>(-1.0), plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pmul(pexp(x), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k0e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k0e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0f.c
+     *	Modified Bessel function, third kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k0f();
+     *
+     * y = k0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns modified Bessel function of the third kind
+     * of order zero of the argument.
+     *
+     * The range is partitioned into the two intervals [0,8] and
+     * (8, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     * Tested at 2000 random points between 0 and 8.  Peak absolute
+     * error (relative when K0 > 1) was 1.46e-14; rms, 4.26e-15.
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       7.8e-7      8.5e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     *  K0 domain          x <= 0          MAXNUM
+     *
+     */
+
+    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
+                       2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
+                       3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+                       -5.35327393233902768720E-1f};
+
+    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
+                       -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
+                       -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
+                       -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, float>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pmul(
+            pexp(pnegate(x)),
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B)),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*
+     *
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k0();
+     *
+     * y = k0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
+     * See k0().
+     *
+     */
+    const double A[] = {
+      1.37446543561352307156E-16,
+      4.25981614279661018399E-14,
+      1.03496952576338420167E-11,
+      1.90451637722020886025E-9,
+      2.53479107902614945675E-7,
+      2.28621210311945178607E-5,
+      1.26461541144692592338E-3,
+      3.59799365153615016266E-2,
+      3.44289899924628486886E-1,
+      -5.35327393233902768720E-1};
+    const double B[] = {
+       5.30043377268626276149E-18, -1.64758043015242134646E-17,
+       5.21039150503902756861E-17, -1.67823109680541210385E-16,
+       5.51205597852431940784E-16, -1.84859337734377901440E-15,
+       6.34007647740507060557E-15, -2.22751332699166985548E-14,
+       8.03289077536357521100E-14, -2.98009692317273043925E-13,
+       1.14034058820847496303E-12, -4.51459788337394416547E-12,
+       1.85594911495471785253E-11, -7.95748924447710747776E-11,
+       3.57739728140030116597E-10, -1.69753450938905987466E-9,
+       8.57403401741422608519E-9, -4.66048989768794782956E-8,
+       2.76681363944501510342E-7, -1.83175552271911948767E-6,
+       1.39498137188764993662E-5, -1.28495495816278026384E-4,
+       1.56988388573005337491E-3, -3.14481013119645005427E-2,
+       2.44030308206595545468E0
+    };
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 10>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, double>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pmul(
+            pexp(-x),
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B)),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k1e_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k1e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* k1ef.c
+     *
+     *	Modified Bessel function, third kind, order one,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1ef();
+     *
+     * y = k1ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order one of the argument:
+     *
+     *      k1e(x) = exp(x) * k1(x).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.9e-7      6.7e-8
+     * See k1().
+     *
+     */
+
+    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
+                        -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
+                        -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+                        1.52530022733894777053E0f};
+    const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
+                       5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
+                       2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
+                       1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
+                       1.03923736576817238437E-1f, 2.72062619048444266945E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pmul(x_le_two, pexp(x));
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        internal::pchebevl<T, 10>::run(
+            psub(pdiv(pset1<T>(8.0), x), two), B),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k1e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k1e.c
+     *
+     *	Modified Bessel function, third kind, order one,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k1e();
+     *
+     * y = k1e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order one of the argument:
+     *
+     *      k1e(x) = exp(x) * k1(x).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       7.8e-16     1.2e-16
+     * See k1().
+     *
+     */
+    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
+                        -6.66690169419932900609E-13, -1.41148839263352776110E-10,
+                        -2.21338763073472585583E-8, -2.43340614156596823496E-6,
+                        -1.73028895751305206302E-4, -6.97572385963986435018E-3,
+                        -1.22611180822657148235E-1, -3.53155960776544875667E-1,
+                        1.52530022733894777053E0};
+    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
+                        -5.68946255844285935196E-17, 1.83809354436663880070E-16,
+                        -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,
+                        -8.97670518232499435011E-14, 3.34841966607842919884E-13,
+                        -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,
+                        -4.19035475934189648750E-10, 2.01504975519703286596E-9,
+                        -1.03457624656780970260E-8, 5.74108412545004946722E-8,
+                        -3.50196060308781257119E-7, 2.40648494783721712015E-6,
+                        -1.93619797416608296024E-5, 1.95215518471351631108E-4,
+                        -2.85781685962277938680E-3, 1.03923736576817238437E-1,
+                        2.72062619048444266945E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pmul(x_le_two, pexp(x));
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(8.0), x), two), B),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k1e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k1e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* k1f.c
+     *	Modified Bessel function, third kind, order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1f();
+     *
+     * y = k1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Computes the modified Bessel function of the third kind
+     * of order one of the argument.
+     *
+     * The range is partitioned into the two intervals [0,2] and
+     * (2, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     * k1 domain          x <= 0          MAXNUM
+     *
+     */
+
+    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
+                        -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
+                        -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+                        1.52530022733894777053E0f};
+    const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
+                       5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
+                       2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
+                       1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
+                       1.03923736576817238437E-1f, 2.72062619048444266945E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pexp(pnegate(x)),
+        pmul(
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x)));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k1.c
+     *	Modified Bessel function, third kind, order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1f();
+     *
+     * y = k1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Computes the modified Bessel function of the third kind
+     * of order one of the argument.
+     *
+     * The range is partitioned into the two intervals [0,2] and
+     * (2, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     * k1 domain          x <= 0          MAXNUM
+     *
+     */
+    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
+                        -6.66690169419932900609E-13, -1.41148839263352776110E-10,
+                        -2.21338763073472585583E-8, -2.43340614156596823496E-6,
+                        -1.73028895751305206302E-4, -6.97572385963986435018E-3,
+                        -1.22611180822657148235E-1, -3.53155960776544875667E-1,
+                        1.52530022733894777053E0};
+    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
+                        -5.68946255844285935196E-17, 1.83809354436663880070E-16,
+                        -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,
+                        -8.97670518232499435011E-14, 3.34841966607842919884E-13,
+                        -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,
+                        -4.19035475934189648750E-10, 2.01504975519703286596E-9,
+                        -1.03457624656780970260E-8, 5.74108412545004946722E-8,
+                        -3.50196060308781257119E-7, 2.40648494783721712015E-6,
+                        -1.93619797416608296024E-5, 1.95215518471351631108E-4,
+                        -2.85781685962277938680E-3, 1.03923736576817238437E-1,
+                        2.72062619048444266945E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pexp(-x),
+        pmul(
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x)));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_j0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_j0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j0f.c
+     *	Bessel function of order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, j0f();
+     *
+     * y = j0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order zero of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval the following polynomial
+     * approximation is used:
+     *
+     *
+     *        2         2         2
+     * (w - r  ) (w - r  ) (w - r  ) P(w)
+     *       1         2         3
+     *
+     *            2
+     * where w = x  and the three r's are zeros of the function.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x R(1/x^2) - pi/4.  The function is
+     *
+     *   j0(x) = Modulus(x) cos( Phase(x) ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 2        100000      1.3e-7      3.6e-8
+     *    IEEE      2, 32       100000      1.9e-7      5.4e-8
+     *
+     */
+
+    const float JP[] = {-6.068350350393235E-008f, 6.388945720783375E-006f,
+                        -3.969646342510940E-004f, 1.332913422519003E-002f,
+                        -1.729150680240724E-001f};
+    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
+                        -2.145007480346739E-001f, 1.197549369473540E-001f,
+                        -3.560281861530129E-003f, -4.969382655296620E-002f,
+                        -3.355424622293709E-006f, 7.978845717621440E-001f};
+    const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
+                        1.756221482109099E+001f, -4.974978466280903E+000f,
+                        1.001973420681837E+000f, -1.939906941791308E-001f,
+                        6.490598792654666E-002f, -1.249992184872738E-001f};
+    const T DR1 =  pset1<T>(5.78318596294678452118f);
+    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_two = pselect(
+        pcmp_lt(y, pset1<T>(1.0e-3f)),
+        pmadd(z, pset1<T>(-0.25f), pset1<T>(1.0f)),
+        pmul(psub(z, DR1), internal::ppolevl<T, 4>::run(z, JP)));
+    T q = pdiv(pset1<T>(1.0f), y);
+    T w = prsqrt(y);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+    w = pmul(q, q);
+    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH), NEG_PIO4F);
+    T y_gt_two = pmul(p, pcos(padd(yn, y)));
+    return pselect(pcmp_le(y, pset1<T>(2.0)), y_le_two, y_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_j0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j0.c
+     *	Bessel function of order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, j0();
+     *
+     * y = j0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order zero of the argument.
+     *
+     * The domain is divided into the intervals [0, 5] and
+     * (5, infinity). In the first interval the following rational
+     * approximation is used:
+     *
+     *
+     *        2         2
+     * (w - r  ) (w - r  ) P (w) / Q (w)
+     *       1         2    3       8
+     *
+     *            2
+     * where w = x  and the two r's are zeros of the function.
+     *
+     * In the second interval, the Hankel asymptotic expansion
+     * is employed with two rational functions of degree 6/6
+     * and 7/7.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain     # trials      peak         rms
+     *    DEC       0, 30       10000       4.4e-17     6.3e-18
+     *    IEEE      0, 30       60000       4.2e-16     1.1e-16
+     *
+     */
+    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
+                        1.23953371646414299388E0, 5.44725003058768775090E0,
+                        8.74716500199817011941E0, 5.30324038235394892183E0,
+                        9.99999999999999997821E-1};
+    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
+                         1.25352743901058953537E0, 5.47097740330417105182E0,
+                         8.76190883237069594232E0, 5.30605288235394617618E0,
+                         1.00000000000000000218E0};
+    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
+                         -1.95539544257735972385E1, -9.32060152123768231369E1,
+                         -1.77681167980488050595E2, -1.47077505154951170175E2,
+                         -5.14105326766599330220E1, -6.05014350600728481186E0};
+    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
+                         8.56430025976980587198E2, 3.88240183605401609683E3,
+                         7.24046774195652478189E3, 5.93072701187316984827E3,
+                         2.06209331660327847417E3, 2.42005740240291393179E2};
+    const double RP[] = {-4.79443220978201773821E9, 1.95617491946556577543E12,
+                         -2.49248344360967716204E14, 9.70862251047306323952E15};
+    const double RQ[] = {1.00000000000000000000E0, 4.99563147152651017219E2,
+                         1.73785401676374683123E5, 4.84409658339962045305E7,
+                         1.11855537045356834862E10, 2.11277520115489217587E12,
+                         3.10518229857422583814E14, 3.18121955943204943306E16,
+                         1.71086294081043136091E18};
+    const T DR1 = pset1<T>(5.78318596294678452118E0);
+    const T DR2 = pset1<T>(3.04712623436620863991E1);
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* pi / 4 */
+
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_five = pselect(
+        pcmp_lt(y, pset1<T>(1.0e-5)),
+        pmadd(z, pset1<T>(-0.25), pset1<T>(1.0)),
+        pmul(pmul(psub(z, DR1), psub(z, DR2)),
+             pdiv(internal::ppolevl<T, 3>::run(z, RP),
+                  internal::ppolevl<T, 8>::run(z, RQ))));
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T yn = padd(y, NEG_PIO4);
+    T w = pdiv(pset1<T>(-5.0), y);
+    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_j0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_j0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_y0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_y0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j0f.c
+     * 	Bessel function of the second kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, y0f();
+     *
+     * y = y0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind, of order
+     * zero, of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *
+     *                  2         2         2
+     * y0(x)  =  (w - r  ) (w - r  ) (w - r  ) R(x)  +  2/pi ln(x) j0(x).
+     *                 1         2         3
+     *
+     * Thus a call to j0() is required.  The three zeros are removed
+     * from R(x) to improve its numerical stability.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x S(1/x^2) - pi/4.  Then the function is
+     *
+     *   y0(x) = Modulus(x) sin( Phase(x) ).
+     *
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *  Absolute error, when y0(x) < 1; else relative error:
+     *
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,  2       100000      2.4e-7      3.4e-8
+     *    IEEE      2, 32       100000      1.8e-7      5.3e-8
+     *
+     */
+
+    const float YP[] = {9.454583683980369E-008f, -9.413212653797057E-006f,
+                        5.344486707214273E-004f, -1.584289289821316E-002f,
+                        1.707584643733568E-001f};
+    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
+                        -2.145007480346739E-001f, 1.197549369473540E-001f,
+                        -3.560281861530129E-003f, -4.969382655296620E-002f,
+                        -3.355424622293709E-006f, 7.978845717621440E-001f};
+    const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
+                        1.756221482109099E+001f, -4.974978466280903E+000f,
+                        1.001973420681837E+000f, -1.939906941791308E-001f,
+                        6.490598792654666E-002f, -1.249992184872738E-001f};
+    const T YZ1 = pset1<T>(0.43221455686510834878f);
+    const T TWOOPI =  pset1<T>(0.636619772367581343075535f); /* 2 / pi */
+    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+    T z = pmul(x, x);
+    T x_le_two = pmul(TWOOPI, pmul(plog(x), generic_j0<T, float>::run(x)));
+    x_le_two = pmadd(
+        psub(z, YZ1), internal::ppolevl<T, 4>::run(z, YP), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_two);
+    T q = pdiv(pset1<T>(1.0), x);
+    T w = prsqrt(x);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+    T u = pmul(q, q);
+    T xn = pmadd(q, internal::ppolevl<T, 7>::run(u, PH), NEG_PIO4F);
+    T x_gt_two = pmul(p, psin(padd(xn, x)));
+    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_y0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j0.c
+     *	Bessel function of the second kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y0();
+     *
+     * y = y0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind, of order
+     * zero, of the argument.
+     *
+     * The domain is divided into the intervals [0, 5] and
+     * (5, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *   y0(x)  = R(x)  +   2 * log(x) * j0(x) / PI.
+     * Thus a call to j0() is required.
+     *
+     * In the second interval, the Hankel asymptotic expansion
+     * is employed with two rational functions of degree 6/6
+     * and 7/7.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *  Absolute error, when y0(x) < 1; else relative error:
+     *
+     * arithmetic   domain     # trials      peak         rms
+     *    DEC       0, 30        9400       7.0e-17     7.9e-18
+     *    IEEE      0, 30       30000       1.3e-15     1.6e-16
+     *
+     */
+    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
+                        1.23953371646414299388E0, 5.44725003058768775090E0,
+                        8.74716500199817011941E0, 5.30324038235394892183E0,
+                        9.99999999999999997821E-1};
+    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
+                         1.25352743901058953537E0, 5.47097740330417105182E0,
+                         8.76190883237069594232E0, 5.30605288235394617618E0,
+                         1.00000000000000000218E0};
+    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
+                         -1.95539544257735972385E1, -9.32060152123768231369E1,
+                         -1.77681167980488050595E2, -1.47077505154951170175E2,
+                         -5.14105326766599330220E1, -6.05014350600728481186E0};
+    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
+                         8.56430025976980587198E2, 3.88240183605401609683E3,
+                         7.24046774195652478189E3, 5.93072701187316984827E3,
+                         2.06209331660327847417E3, 2.42005740240291393179E2};
+    const double YP[] = {1.55924367855235737965E4, -1.46639295903971606143E7,
+                         5.43526477051876500413E9, -9.82136065717911466409E11,
+                         8.75906394395366999549E13, -3.46628303384729719441E15,
+                         4.42733268572569800351E16, -1.84950800436986690637E16};
+    const double YQ[] = {1.00000000000000000000E0,  1.04128353664259848412E3,
+                         6.26107330137134956842E5, 2.68919633393814121987E8,
+                         8.64002487103935000337E10, 2.02979612750105546709E13,
+                         3.17157752842975028269E15, 2.50596256172653059228E17};
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    const T TWOOPI =  pset1<T>(0.636619772367581343075535); /* 2 / pi */
+    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* -pi / 4 */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_five = pdiv(internal::ppolevl<T, 7>::run(z, YP),
+                       internal::ppolevl<T, 7>::run(z, YQ));
+    x_le_five = pmadd(
+        pmul(TWOOPI, plog(x)), generic_j0<T, double>::run(x), x_le_five);
+    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T xn = padd(x, NEG_PIO4);
+    T w = pdiv(pset1<T>(5.0), x);
+    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_y0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_y0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_j1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_j1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j1f.c
+     *	Bessel function of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, j1f();
+     *
+     * y = j1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order one of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a polynomial approximation
+     *        2
+     * (w - r  ) x P(w)
+     *       1
+     *                     2
+     * is used, where w = x  and r is the first zero of the function.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x R(1/x^2) - 3pi/4.  The function is
+     *
+     *   j0(x) = Modulus(x) cos( Phase(x) ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak       rms
+     *    IEEE      0,  2       100000       1.2e-7     2.5e-8
+     *    IEEE      2, 32       100000       2.0e-7     5.3e-8
+     *
+     *
+     */
+
+    const float JP[] = {-4.878788132172128E-009f, 6.009061827883699E-007f,
+                        -4.541343896997497E-005f, 1.937383947804541E-003f,
+                        -3.405537384615824E-002f};
+    const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
+                        3.138238455499697E-001f, -2.102302420403875E-001f,
+                        5.435364690523026E-003f, 1.493389585089498E-001f,
+                        4.976029650847191E-006f, 7.978845453073848E-001f};
+    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
+                        -2.485774108720340E+001f, 7.222973196770240E+000f,
+                        -1.544842782180211E+000f, 3.503787691653334E-001f,
+                        -1.637986776941202E-001f, 3.749989509080821E-001f};
+    const T Z1 = pset1<T>(1.46819706421238932572E1f);
+    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f);    /* -3*pi/4 */
+
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_two = pmul(
+        psub(z, Z1),
+        pmul(x, internal::ppolevl<T, 4>::run(z, JP)));
+    T q = pdiv(pset1<T>(1.0f), y);
+    T w = prsqrt(y);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+    w = pmul(q, q);
+    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+    T y_gt_two = pmul(p, pcos(padd(yn, y)));
+    // j1 is an odd function. This implementation differs from cephes to
+    // take this fact in to account. Cephes returns -j1(x) for y > 2 range.
+    y_gt_two = pselect(
+        pcmp_lt(x, pset1<T>(0.0f)), pnegate(y_gt_two), y_gt_two);
+    return pselect(pcmp_le(y, pset1<T>(2.0f)), y_le_two, y_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_j1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j1.c
+     *	Bessel function of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, j1();
+     *
+     * y = j1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order one of the argument.
+     *
+     * The domain is divided into the intervals [0, 8] and
+     * (8, infinity). In the first interval a 24 term Chebyshev
+     * expansion is used. In the second, the asymptotic
+     * trigonometric representation is employed using two
+     * rational functions of degree 5/5.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    DEC       0, 30       10000       4.0e-17     1.1e-17
+     *    IEEE      0, 30       30000       2.6e-16     1.1e-16
+     *
+     */
+    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
+                         1.12719608129684925192E0, 5.11207951146807644818E0,
+                         8.42404590141772420927E0, 5.21451598682361504063E0,
+                         1.00000000000000000254E0};
+    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
+                         1.10514232634061696926E0, 5.07386386128601488557E0,
+                         8.39985554327604159757E0, 5.20982848682361821619E0,
+                         9.99999999999999997461E-1};
+    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
+                         7.58238284132545283818E1, 3.66779609360150777800E2,
+                         7.10856304998926107277E2, 5.97489612400613639965E2,
+                         2.11688757100572135698E2, 2.52070205858023719784E1};
+    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
+                         1.05644886038262816351E3, 4.98641058337653607651E3,
+                         9.56231892404756170795E3, 7.99704160447350683650E3,
+                         2.82619278517639096600E3, 3.36093607810698293419E2};
+    const double RP[] = {-8.99971225705559398224E8, 4.52228297998194034323E11,
+                         -7.27494245221818276015E13, 3.68295732863852883286E15};
+    const double RQ[] = {1.00000000000000000000E0, 6.20836478118054335476E2,
+                         2.56987256757748830383E5, 8.35146791431949253037E7,
+                         2.21511595479792499675E10, 4.74914122079991414898E12,
+                         7.84369607876235854894E14, 8.95222336184627338078E16,
+                         5.32278620332680085395E18};
+    const T Z1 = pset1<T>(1.46819706421238932572E1);
+    const T Z2 = pset1<T>(4.92184563216946036703E1);
+    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885);    /* -3*pi/4 */
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_five = pdiv(internal::ppolevl<T, 3>::run(z, RP),
+                       internal::ppolevl<T, 8>::run(z, RQ));
+    y_le_five = pmul(pmul(pmul(y_le_five, x), psub(z, Z1)), psub(z, Z2));
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T yn = padd(y, NEG_THPIO4);
+    T w = pdiv(pset1<T>(-5.0), y);
+    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+    // j1 is an odd function. This implementation differs from cephes to
+    // take this fact in to account. Cephes returns -j1(x) for y > 5 range.
+    y_gt_five = pselect(
+        pcmp_lt(x, pset1<T>(0.0)), pnegate(y_gt_five), y_gt_five);
+    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_j1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_j1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_y1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_y1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j1f.c
+     *	Bessel function of second kind of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y1();
+     *
+     * y = y1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind of order one
+     * of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *
+     *                  2
+     * y0(x)  =  (w - r  ) x R(x^2)  +  2/pi (ln(x) j1(x) - 1/x) .
+     *                 1
+     *
+     * Thus a call to j1() is required.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x S(1/x^2) - 3pi/4.  Then the function is
+     *
+     *   y0(x) = Modulus(x) sin( Phase(x) ).
+     *
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    IEEE      0,  2       100000       2.2e-7     4.6e-8
+     *    IEEE      2, 32       100000       1.9e-7     5.3e-8
+     *
+     * (error criterion relative when |y1| > 1).
+     *
+     */
+
+    const float YP[] = {8.061978323326852E-009f, -9.496460629917016E-007f,
+                        6.719543806674249E-005f, -2.641785726447862E-003f,
+                        4.202369946500099E-002f};
+    const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
+                        3.138238455499697E-001f, -2.102302420403875E-001f,
+                        5.435364690523026E-003f, 1.493389585089498E-001f,
+                        4.976029650847191E-006f, 7.978845453073848E-001f};
+    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
+                        -2.485774108720340E+001f, 7.222973196770240E+000f,
+                        -1.544842782180211E+000f, 3.503787691653334E-001f,
+                        -1.637986776941202E-001f, 3.749989509080821E-001f};
+    const T YO1 = pset1<T>(4.66539330185668857532f);
+    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f);    /* -3*pi/4 */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535f); /* 2/pi */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_two = pmul(psub(z, YO1), internal::ppolevl<T, 4>::run(z, YP));
+    x_le_two = pmadd(
+       x_le_two, x,
+       pmul(TWOOPI, pmadd(
+           generic_j1<T, float>::run(x), plog(x),
+           pdiv(pset1<T>(-1.0f), x))));
+    x_le_two = pselect(pcmp_lt(x, pset1<T>(0.0f)), NEG_MAXNUM, x_le_two);
+
+    T q = pdiv(pset1<T>(1.0), x);
+    T w = prsqrt(x);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+    w = pmul(q, q);
+    T xn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+    T x_gt_two = pmul(p, psin(padd(xn, x)));
+    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_y1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j1.c
+     *	Bessel function of second kind of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y1();
+     *
+     * y = y1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind of order one
+     * of the argument.
+     *
+     * The domain is divided into the intervals [0, 8] and
+     * (8, infinity). In the first interval a 25 term Chebyshev
+     * expansion is used, and a call to j1() is required.
+     * In the second, the asymptotic trigonometric representation
+     * is employed using two rational functions of degree 5/5.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    DEC       0, 30       10000       8.6e-17     1.3e-17
+     *    IEEE      0, 30       30000       1.0e-15     1.3e-16
+     *
+     * (error criterion relative when |y1| > 1).
+     *
+     */
+    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
+                         1.12719608129684925192E0, 5.11207951146807644818E0,
+                         8.42404590141772420927E0, 5.21451598682361504063E0,
+                         1.00000000000000000254E0};
+    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
+                         1.10514232634061696926E0, 5.07386386128601488557E0,
+                         8.39985554327604159757E0, 5.20982848682361821619E0,
+                         9.99999999999999997461E-1};
+    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
+                         7.58238284132545283818E1, 3.66779609360150777800E2,
+                         7.10856304998926107277E2, 5.97489612400613639965E2,
+                         2.11688757100572135698E2, 2.52070205858023719784E1};
+    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
+                         1.05644886038262816351E3, 4.98641058337653607651E3,
+                         9.56231892404756170795E3, 7.99704160447350683650E3,
+                         2.82619278517639096600E3, 3.36093607810698293419E2};
+    const double YP[] = {1.26320474790178026440E9, -6.47355876379160291031E11,
+                         1.14509511541823727583E14, -8.12770255501325109621E15,
+                         2.02439475713594898196E17, -7.78877196265950026825E17};
+    const double YQ[] = {1.00000000000000000000E0, 5.94301592346128195359E2,
+                         2.35564092943068577943E5, 7.34811944459721705660E7,
+                         1.87601316108706159478E10, 3.88231277496238566008E12,
+                         6.20557727146953693363E14, 6.87141087355300489866E16,
+                         3.97270608116560655612E18};
+    const T SQ2OPI = pset1<T>(.79788456080286535588);
+    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885);    /* -3*pi/4 */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535); /* 2/pi */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_five = pdiv(internal::ppolevl<T, 5>::run(z, YP),
+                   internal::ppolevl<T, 8>::run(z, YQ));
+    x_le_five = pmadd(
+        x_le_five, x, pmul(
+            TWOOPI, pmadd(generic_j1<T, double>::run(x), plog(x),
+                          pdiv(pset1<T>(-1.0), x))));
+
+    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T xn = padd(x, NEG_THPIO4);
+    T w = pdiv(pset1<T>(5.0), x);
+    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_y1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_y1<T>::run(x);
+  }
+};
+
+}  // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0, Scalar)
+    bessel_i0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0e, Scalar)
+    bessel_i0e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1, Scalar)
+    bessel_i1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1e, Scalar)
+    bessel_i1e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0, Scalar)
+    bessel_k0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0e, Scalar)
+    bessel_k0e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1, Scalar)
+    bessel_k1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1e, Scalar)
+    bessel_k1e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j0, Scalar)
+    bessel_j0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_j0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y0, Scalar)
+    bessel_y0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_y0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j1, Scalar)
+    bessel_j1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_j1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y1, Scalar)
+    bessel_y1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_y1, Scalar)::run(x);
+}
+
+}  // end namespace numext
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSEL_FUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
new file mode 100644
index 0000000000000000000000000000000000000000..943d10f6afcd22f80d18dbd1c7aed196073968bf
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+#define EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i0(const Packet& x) {
+  return numext::bessel_i0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i0e(const Packet& x) {
+  return numext::bessel_i0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i1(const Packet& x) {
+  return numext::bessel_i1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i1e(const Packet& x) {
+  return numext::bessel_i1e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_j0(const Packet& x) {
+  return numext::bessel_j0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_j1(const Packet& x) {
+  return numext::bessel_j1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_y0(const Packet& x) {
+  return numext::bessel_y0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_y1(const Packet& x) {
+  return numext::bessel_y1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k0(const Packet& x) {
+  return numext::bessel_k0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k0e(const Packet& x) {
+  return numext::bessel_k0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k1(const Packet& x) {
+  return numext::bessel_k1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k1e(const Packet& x) {
+  return numext::bessel_k1e(x);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7b231adb077b9594847df1b56d92b54f503b86c
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
@@ -0,0 +1,67 @@
+#ifndef HIP_VECTOR_COMPATIBILITY_H
+#define HIP_VECTOR_COMPATIBILITY_H
+
+namespace hip_impl {
+  template <typename, typename, unsigned int> struct Scalar_accessor;
+}   // end namespace hip_impl
+
+namespace Eigen {
+namespace internal {
+
+#define HIP_SCALAR_ACCESSOR_BUILDER(NAME) \
+template <typename T, typename U, unsigned int n> \
+struct NAME <hip_impl::Scalar_accessor<T, U, n>> : NAME <T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(NAME) \
+template <typename T, typename U, unsigned int n> \
+struct NAME##_impl <hip_impl::Scalar_accessor<T, U, n>> : NAME##_impl <T> {}; \
+template <typename T, typename U, unsigned int n> \
+struct NAME##_retval <hip_impl::Scalar_accessor<T, U, n>> : NAME##_retval <T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(NAME) \
+template <typename T, typename U, unsigned int n, IgammaComputationMode mode> \
+struct NAME <hip_impl::Scalar_accessor<T, U, n>, mode> : NAME <T, mode> {};
+
+#if EIGEN_HAS_C99_MATH
+HIP_SCALAR_ACCESSOR_BUILDER(betainc_helper)
+HIP_SCALAR_ACCESSOR_BUILDER(incbeta_cfe)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erf)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erfc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igammac)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(lgamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(ndtri)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(polygamma)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_generic_impl)
+#endif
+
+HIP_SCALAR_ACCESSOR_BUILDER(digamma_impl_maybe_poly)
+HIP_SCALAR_ACCESSOR_BUILDER(zeta_impl_series)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(betainc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(digamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(gamma_sample_der_alpha)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma_der_a)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(zeta)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_series_impl)
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igammac_cf_impl)
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // HIP_VECTOR_COMPATIBILITY_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
index ed415db990da5008c12479f1acacdeace464f9c2..691ff4d038d477340fa032df50cd0759481faa90 100644
--- a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -24,7 +24,7 @@ namespace Eigen {
   * \sa Eigen::igammac(), Eigen::lgamma()
   */
 template<typename Derived,typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
@@ -33,6 +33,48 @@ igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerive
   );
 }
 
+/** \cpp11 \returns an expression of the coefficient-wise igamma_der_a(\a a, \a x) to the given arrays.
+  *
+  * This function computes the coefficient-wise derivative of the incomplete
+  * gamma function with respect to the parameter a.
+  *
+  * \note This function supports only float and double scalar types in c++11
+  * mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations
+  * of igamma_der_a(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igamma(), Eigen::lgamma()
+  */
+template <typename Derived, typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise gamma_sample_der_alpha(\a alpha, \a sample) to the given arrays.
+  *
+  * This function computes the coefficient-wise derivative of the sample
+  * of a Gamma(alpha, 1) random variable with respect to the parameter alpha.
+  *
+  * \note This function supports only float and double scalar types in c++11
+  * mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations
+  * of gamma_sample_der_alpha(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igamma(), Eigen::lgamma()
+  */
+template <typename AlphaDerived, typename SampleDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>
+gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen::ArrayBase<SampleDerived>& sample) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>(
+      alpha.derived(),
+      sample.derived());
+}
+
 /** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
   *
   * This function computes the coefficient-wise complementary incomplete gamma function.
@@ -44,7 +86,7 @@ igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerive
   * \sa Eigen::igamma(), Eigen::lgamma()
   */
 template<typename Derived,typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
@@ -66,7 +108,7 @@ igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDeriv
 // * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
 // * \sa ArrayBase::polygamma()
 template<typename DerivedN,typename DerivedX>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
 polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
@@ -86,7 +128,7 @@ polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>&
   * \sa Eigen::betainc(), Eigen::lgamma()
   */
 template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
-inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
 betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
 {
   return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
@@ -101,7 +143,7 @@ betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDeriv
   *
   * It returns the Riemann zeta function of two arguments \a x and \a q:
   *
-  * \param x is the exposent, it must be > 1
+  * \param x is the exponent, it must be > 1
   * \param q is the shift, it must be > 0
   *
   * \note This function supports only float and double scalar types. To support other scalar types, the user has
@@ -110,7 +152,7 @@ betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDeriv
   * \sa ArrayBase::zeta()
   */
 template<typename DerivedX,typename DerivedQ>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
 zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
@@ -119,6 +161,7 @@ zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
   );
 }
 
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d94231f0eeee2acd9d810c427aa4bdb2d548320
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+#define EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 lgamma(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 digamma(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 zeta(const Eigen::bfloat16& x, const Eigen::bfloat16& q) {
+  return Eigen::bfloat16(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 polygamma(const Eigen::bfloat16& n, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erf(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erfc(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 ndtri(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::ndtri(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma_der_a(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 gamma_sample_der_alpha(const Eigen::bfloat16& alpha, const Eigen::bfloat16& sample) {
+  return Eigen::bfloat16(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igammac(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 betainc(const Eigen::bfloat16& a, const Eigen::bfloat16& b, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
index d8f2363bea71a3572fad34b413def2ab47c1e497..abefe99b7e7fdd1333977adf8a85d9d290a9ca83 100644
--- a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -41,6 +41,60 @@ struct functor_traits<scalar_igamma_op<Scalar> > {
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the derivative of the incomplete gamma
+  * function igamma_der_a(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igamma_der_a
+  */
+template <typename Scalar>
+struct scalar_igamma_der_a_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_der_a_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
+    using numext::igamma_der_a;
+    return igamma_der_a(a, x);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigamma_der_a(a, x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_igamma_der_a_op<Scalar> > {
+  enum {
+    // 2x the cost of igamma
+    Cost = 40 * NumTraits<Scalar>::MulCost + 20 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGammaDerA
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the derivative of the sample
+  * of a Gamma(alpha, 1) random variable with respect to the parameter alpha
+  * gamma_sample_der_alpha(alpha, sample)
+  *
+  * \sa class CwiseBinaryOp, Cwise::gamma_sample_der_alpha
+  */
+template <typename Scalar>
+struct scalar_gamma_sample_der_alpha_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_gamma_sample_der_alpha_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& alpha, const Scalar& sample) const {
+    using numext::gamma_sample_der_alpha;
+    return gamma_sample_der_alpha(alpha, sample);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& alpha, const Packet& sample) const {
+    return internal::pgamma_sample_der_alpha(alpha, sample);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_gamma_sample_der_alpha_op<Scalar> > {
+  enum {
+    // 2x the cost of igamma, minus the lgamma cost (the lgamma cancels out)
+    Cost = 30 * NumTraits<Scalar>::MulCost + 15 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasGammaSampleDerAlpha
+  };
+};
 
 /** \internal
   * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
@@ -101,11 +155,11 @@ struct functor_traits<scalar_betainc_op<Scalar> > {
  */
 template<typename Scalar> struct scalar_lgamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::lgamma; return lgamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_lgamma_op<Scalar> >
@@ -123,11 +177,11 @@ struct functor_traits<scalar_lgamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_digamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::digamma; return digamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_digamma_op<Scalar> >
@@ -145,11 +199,11 @@ struct functor_traits<scalar_digamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_zeta_op {
     EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
-    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& q) const {
         using numext::zeta; return zeta(x, q);
     }
     typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_zeta_op<Scalar> >
@@ -167,11 +221,11 @@ struct functor_traits<scalar_zeta_op<Scalar> >
  */
 template<typename Scalar> struct scalar_polygamma_op {
     EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
-    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& n, const Scalar& x) const {
         using numext::polygamma; return polygamma(n, x);
     }
     typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_polygamma_op<Scalar> >
@@ -184,25 +238,40 @@ struct functor_traits<scalar_polygamma_op<Scalar> >
 };
 
 /** \internal
- * \brief Template functor to compute the Gauss error function of a
- * scalar
- * \sa class CwiseUnaryOp, Cwise::erf()
+ * \brief Template functor to compute the error function of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::erf()
  */
 template<typename Scalar> struct scalar_erf_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::erf; return erf(a);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& a) const {
+    return numext::erf(a);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return perf(x);
   }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_erf_op<Scalar> >
-{
+template <typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> > {
   enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasErf
+    PacketAccess = packet_traits<Scalar>::HasErf,
+    Cost =
+        (PacketAccess
+#ifdef EIGEN_VECTORIZE_FMA
+             // TODO(rmlarsen): Move the FMA cost model to a central location.
+             // Haswell can issue 2 add/mul/madd per cycle.
+             // 10 pmadd, 2 pmul, 1 div, 2 other
+             ? (2 * NumTraits<Scalar>::AddCost +
+                7 * NumTraits<Scalar>::MulCost +
+                scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#else
+             ? (12 * NumTraits<Scalar>::AddCost +
+                12 * NumTraits<Scalar>::MulCost +
+                scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#endif
+             // Assume for simplicity that this is as expensive as an exp().
+             : (functor_traits<scalar_exp_op<Scalar> >::Cost))
   };
 };
 
@@ -213,11 +282,11 @@ struct functor_traits<scalar_erf_op<Scalar> >
  */
 template<typename Scalar> struct scalar_erfc_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::erfc; return erfc(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_erfc_op<Scalar> >
@@ -229,6 +298,31 @@ struct functor_traits<scalar_erfc_op<Scalar> >
   };
 };
 
+/** \internal
+ * \brief Template functor to compute the Inverse of the normal distribution
+ * function of a scalar
+ * \sa class CwiseUnaryOp, Cwise::ndtri()
+ */
+template<typename Scalar> struct scalar_ndtri_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_ndtri_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+    using numext::ndtri; return ndtri(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ndtri_op<Scalar> >
+{
+  enum {
+    // On average, We are evaluating rational functions with degree N=9 in the
+    // numerator and denominator. This results in 2*N additions and 2*N
+    // multiplications.
+    Cost = 18 * NumTraits<Scalar>::MulCost + 18 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNdtri
+  };
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
index 553bcda6a65b0369564a931142ca86d6651b34c4..2a3a53168d221f439590bd692b63dfe0be856399 100644
--- a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -30,9 +30,20 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::ha
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
   return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
 }
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::ndtri(static_cast<float>(a)));
+}
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
   return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
 }
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma_der_a(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half gamma_sample_der_alpha(const Eigen::half& alpha, const Eigen::half& sample) {
+  return Eigen::half(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
   return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
 }
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index f524d7137741e645642c2de3d4c7218526a5f0ae..f1c260e29450eb6b557e664138b8093d343a6f9e 100644
--- a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -36,66 +36,6 @@ namespace internal {
 //    Good luck with your project,
 //    Steve
 
-namespace cephes {
-
-/* polevl (modified for Eigen)
- *
- *      Evaluate polynomial
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N+1];
- *
- * y = polevl<decltype(x), N>( x, coef);
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates polynomial of degree N:
- *
- *                     2          N
- * y  =  C  + C x + C x  +...+ C x
- *        0    1     2          N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C  , ..., coef[N] = C  .
- *            N                   0
- *
- *  The function p1evl() assumes that coef[N] = 1.0 and is
- * omitted from the array.  Its calling arguments are
- * otherwise the same as polevl().
- *
- *
- * The Eigen implementation is templatized.  For best speed, store
- * coef as a const array (constexpr), e.g.
- *
- * const double coef[] = {1.0, 2.0, 3.0, ...};
- *
- */
-template <typename Scalar, int N>
-struct polevl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
-    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
-  }
-};
-
-template <typename Scalar>
-struct polevl<Scalar, 0> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
-    return coef[0];
-  }
-};
-
-}  // end namespace cephes
 
 /****************************************************************************
  * Implementation of lgamma, requires C++11/C99                             *
@@ -117,13 +57,27 @@ struct lgamma_retval {
 };
 
 #if EIGEN_HAS_C99_MATH
+// Since glibc 2.19
+#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 19) || __GLIBC__>2) \
+ && (defined(_DEFAULT_SOURCE) || defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
+// Glibc versions before 2.19
+#if defined(__GLIBC__) && ((__GLIBC__==2 && __GLIBC_MINOR__ < 19) || __GLIBC__<2) \
+ && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
 template <>
 struct lgamma_impl<float> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
-    int signgam;
-    return ::lgammaf_r(x, &signgam);
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined (EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+    int dummy;
+    return ::lgammaf_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::lgamma(x);
 #else
     return ::lgammaf(x);
 #endif
@@ -134,14 +88,18 @@ template <>
 struct lgamma_impl<double> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
-    int signgam;
-    return ::lgamma_r(x, &signgam);
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+    int dummy;
+    return ::lgamma_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::lgamma(x);
 #else
     return ::lgamma(x);
 #endif
   }
 };
+
+#undef EIGEN_HAS_LGAMMA_R
 #endif
 
 /****************************************************************************
@@ -191,7 +149,7 @@ struct digamma_impl_maybe_poly<float> {
     float z;
     if (s < 1.0e8f) {
       z = 1.0f / (s * s);
-      return z * cephes::polevl<float, 3>::run(z, A);
+      return z * internal::ppolevl<float, 3>::run(z, A);
     } else return 0.0f;
   }
 };
@@ -213,7 +171,7 @@ struct digamma_impl_maybe_poly<double> {
     double z;
     if (s < 1.0e17) {
       z = 1.0 / (s * s);
-      return z * cephes::polevl<double, 6>::run(z, A);
+      return z * internal::ppolevl<double, 6>::run(z, A);
     }
     else return 0.0;
   }
@@ -283,7 +241,7 @@ struct digamma_impl {
     Scalar p, q, nz, s, w, y;
     bool negative = false;
 
-    const Scalar maxnum = NumTraits<Scalar>::infinity();
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
     const Scalar m_pi = Scalar(EIGEN_PI);
 
     const Scalar zero = Scalar(0);
@@ -296,7 +254,7 @@ struct digamma_impl {
       q = x;
       p = numext::floor(q);
       if (p == q) {
-        return maxnum;
+        return nan;
       }
       /* Remove the zeros of tan(m_pi x)
        * by subtracting the nearest integer from x
@@ -335,13 +293,63 @@ struct digamma_impl {
  * Implementation of erf, requires C++11/C99                                *
  ****************************************************************************/
 
-template <typename Scalar>
+/** \internal \returns the error function of \a a (coeff-wise)
+    Doesn't do anything fancy, just a 13/8-degree rational interpolant which
+    is accurate up to a couple of ulp in the range [-4, 4], outside of which
+    fl(erf(x)) = +/-1.
+
+    This implementation works on both scalars and Ts.
+*/
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) {
+  // Clamp the inputs to the range [-4, 4] since anything outside
+  // this range is +/-1.0f in single-precision.
+  const T plus_4 = pset1<T>(4.f);
+  const T minus_4 = pset1<T>(-4.f);
+  const T x = pmax(pmin(a_x, plus_4), minus_4);
+  // The monomial coefficients of the numerator polynomial (odd).
+  const T alpha_1 = pset1<T>(-1.60960333262415e-02f);
+  const T alpha_3 = pset1<T>(-2.95459980854025e-03f);
+  const T alpha_5 = pset1<T>(-7.34990630326855e-04f);
+  const T alpha_7 = pset1<T>(-5.69250639462346e-05f);
+  const T alpha_9 = pset1<T>(-2.10102402082508e-06f);
+  const T alpha_11 = pset1<T>(2.77068142495902e-08f);
+  const T alpha_13 = pset1<T>(-2.72614225801306e-10f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const T beta_0 = pset1<T>(-1.42647390514189e-02f);
+  const T beta_2 = pset1<T>(-7.37332916720468e-03f);
+  const T beta_4 = pset1<T>(-1.68282697438203e-03f);
+  const T beta_6 = pset1<T>(-2.13374055278905e-04f);
+  const T beta_8 = pset1<T>(-1.45660718464996e-05f);
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  T p = pmadd(x2, alpha_13, alpha_11);
+  p = pmadd(x2, p, alpha_9);
+  p = pmadd(x2, p, alpha_7);
+  p = pmadd(x2, p, alpha_5);
+  p = pmadd(x2, p, alpha_3);
+  p = pmadd(x2, p, alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial p.
+  T q = pmadd(x2, beta_8, beta_6);
+  q = pmadd(x2, q, beta_4);
+  q = pmadd(x2, q, beta_2);
+  q = pmadd(x2, q, beta_0);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+template <typename T>
 struct erf_impl {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
-    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return Scalar(0);
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return generic_fast_erf_float(x);
   }
 };
 
@@ -354,13 +362,25 @@ struct erf_retval {
 template <>
 struct erf_impl<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
+  static EIGEN_STRONG_INLINE float run(float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return generic_fast_erf_float(x);
+#endif
+  }
 };
 
 template <>
 struct erf_impl<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
+  static EIGEN_STRONG_INLINE double run(double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return ::erf(x);
+#endif
+  }
 };
 #endif  // EIGEN_HAS_C99_MATH
 
@@ -387,16 +407,270 @@ struct erfc_retval {
 template <>
 struct erfc_impl<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
+  static EIGEN_STRONG_INLINE float run(const float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erfc(x);
+#else
+    return ::erfcf(x);
+#endif
+  }
 };
 
 template <>
 struct erfc_impl<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
+  static EIGEN_STRONG_INLINE double run(const double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erfc(x);
+#else
+    return ::erfc(x);
+#endif
+  }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+
+/***************************************************************************
+* Implementation of ndtri.                                                 *
+****************************************************************************/
+
+/* Inverse of Normal distribution function (modified for Eigen).
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, ndtri();
+ *
+ * x = ndtri( y );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns the argument, x, for which the area under the
+ * Gaussian probability density function (integrated from
+ * minus infinity to x) is equal to y.
+ *
+ *
+ * For small arguments 0 < y < exp(-2), the program computes
+ * z = sqrt( -2.0 * log(y) );  then the approximation is
+ * x = z - log(z)/z  - (1/z) P(1/z) / Q(1/z).
+ * There are two rational functions P/Q, one for 0 < y < exp(-32)
+ * and the other for y up to exp(-2).  For larger arguments,
+ * w = y - 0.5, and  x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
+ *
+ *
+ * ACCURACY:
+ *
+ *                      Relative error:
+ * arithmetic   domain        # trials      peak         rms
+ *    DEC      0.125, 1         5500       9.5e-17     2.1e-17
+ *    DEC      6e-39, 0.135     3500       5.7e-17     1.3e-17
+ *    IEEE     0.125, 1        20000       7.2e-16     1.3e-16
+ *    IEEE     3e-308, 0.135   50000       4.6e-16     9.8e-17
+ *
+ *
+ * ERROR MESSAGES:
+ *
+ *   message         condition    value returned
+ * ndtri domain       x <= 0        -MAXNUM
+ * ndtri domain       x >= 1         MAXNUM
+ *
+ */
+ /*
+   Cephes Math Library Release 2.2: June, 1992
+   Copyright 1985, 1987, 1992 by Stephen L. Moshier
+   Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+
+
+// TODO: Add a cheaper approximation for float.
+
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(
+    const T& should_flipsign, const T& x) {
+  typedef typename unpacket_traits<T>::type Scalar;
+  const T sign_mask = pset1<T>(Scalar(-0.0));
+  T sign_bit = pand<T>(should_flipsign, sign_mask);
+  return pxor<T>(sign_bit, x);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign<double>(
+    const double& should_flipsign, const double& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(
+    const float& should_flipsign, const float& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+// We split this computation in to two so that in the scalar path
+// only one branch is evaluated (due to our template specialization of pselect
+// being an if statement.)
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
+  const ScalarType p0[] = {
+    ScalarType(-5.99633501014107895267e1),
+    ScalarType(9.80010754185999661536e1),
+    ScalarType(-5.66762857469070293439e1),
+    ScalarType(1.39312609387279679503e1),
+    ScalarType(-1.23916583867381258016e0)
+  };
+  const ScalarType q0[] = {
+    ScalarType(1.0),
+    ScalarType(1.95448858338141759834e0),
+    ScalarType(4.67627912898881538453e0),
+    ScalarType(8.63602421390890590575e1),
+    ScalarType(-2.25462687854119370527e2),
+    ScalarType(2.00260212380060660359e2),
+    ScalarType(-8.20372256168333339912e1),
+    ScalarType(1.59056225126211695515e1),
+    ScalarType(-1.18331621121330003142e0)
+  };
+  const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
+  const T half = pset1<T>(ScalarType(0.5));
+  T c, c2, ndtri_gt_exp_neg_two;
+
+  c = psub(b, half);
+  c2 = pmul(c, c);
+  ndtri_gt_exp_neg_two = pmadd(c, pmul(
+      c2, pdiv(
+          internal::ppolevl<T, 4>::run(c2, p0),
+          internal::ppolevl<T, 8>::run(c2, q0))), c);
+  return pmul(ndtri_gt_exp_neg_two, sqrt2pi);
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(
+    const T& b, const T& should_flipsign) {
+  /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
+   * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
+   */
+  const ScalarType p1[] = {
+    ScalarType(4.05544892305962419923e0),
+    ScalarType(3.15251094599893866154e1),
+    ScalarType(5.71628192246421288162e1),
+    ScalarType(4.40805073893200834700e1),
+    ScalarType(1.46849561928858024014e1),
+    ScalarType(2.18663306850790267539e0),
+    ScalarType(-1.40256079171354495875e-1),
+    ScalarType(-3.50424626827848203418e-2),
+    ScalarType(-8.57456785154685413611e-4)
+  };
+  const ScalarType q1[] = {
+    ScalarType(1.0),
+    ScalarType(1.57799883256466749731e1),
+    ScalarType(4.53907635128879210584e1),
+    ScalarType(4.13172038254672030440e1),
+    ScalarType(1.50425385692907503408e1),
+    ScalarType(2.50464946208309415979e0),
+    ScalarType(-1.42182922854787788574e-1),
+    ScalarType(-3.80806407691578277194e-2),
+    ScalarType(-9.33259480895457427372e-4)
+  };
+  /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
+   * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+   */
+  const ScalarType p2[] = {
+    ScalarType(3.23774891776946035970e0),
+    ScalarType(6.91522889068984211695e0),
+    ScalarType(3.93881025292474443415e0),
+    ScalarType(1.33303460815807542389e0),
+    ScalarType(2.01485389549179081538e-1),
+    ScalarType(1.23716634817820021358e-2),
+    ScalarType(3.01581553508235416007e-4),
+    ScalarType(2.65806974686737550832e-6),
+    ScalarType(6.23974539184983293730e-9)
+  };
+  const ScalarType q2[] = {
+    ScalarType(1.0),
+    ScalarType(6.02427039364742014255e0),
+    ScalarType(3.67983563856160859403e0),
+    ScalarType(1.37702099489081330271e0),
+    ScalarType(2.16236993594496635890e-1),
+    ScalarType(1.34204006088543189037e-2),
+    ScalarType(3.28014464682127739104e-4),
+    ScalarType(2.89247864745380683936e-6),
+    ScalarType(6.79019408009981274425e-9)
+  };
+  const T eight = pset1<T>(ScalarType(8.0));
+  const T one = pset1<T>(ScalarType(1));
+  const T neg_two = pset1<T>(ScalarType(-2));
+  T x, x0, x1, z;
+
+  x = psqrt(pmul(neg_two, plog(b)));
+  x0 = psub(x, pdiv(plog(x), x));
+  z = pdiv(one, x);
+  x1 = pmul(
+      z, pselect(
+          pcmp_lt(x, eight),
+          pdiv(internal::ppolevl<T, 8>::run(z, p1),
+               internal::ppolevl<T, 8>::run(z, q1)),
+          pdiv(internal::ppolevl<T, 8>::run(z, p2),
+               internal::ppolevl<T, 8>::run(z, q2))));
+  return flipsign(should_flipsign, psub(x0, x1));
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T generic_ndtri(const T& a) {
+  const T maxnum = pset1<T>(NumTraits<ScalarType>::infinity());
+  const T neg_maxnum = pset1<T>(-NumTraits<ScalarType>::infinity());
+
+  const T zero = pset1<T>(ScalarType(0));
+  const T one = pset1<T>(ScalarType(1));
+  // exp(-2)
+  const T exp_neg_two = pset1<T>(ScalarType(0.13533528323661269189));
+  T b, ndtri, should_flipsign;
+
+  should_flipsign = pcmp_le(a, psub(one, exp_neg_two));
+  b = pselect(should_flipsign, a, psub(one, a));
+
+  ndtri = pselect(
+      pcmp_lt(exp_neg_two, b),
+      generic_ndtri_gt_exp_neg_two<T, ScalarType>(b),
+      generic_ndtri_lt_exp_neg_two<T, ScalarType>(b, should_flipsign));
+
+  return pselect(
+      pcmp_le(a, zero), neg_maxnum,
+      pselect(pcmp_le(one, a), maxnum, ndtri));
+}
+
+template <typename Scalar>
+struct ndtri_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+# else
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
+    return generic_ndtri<Scalar, Scalar>(x);
+  }
 };
+
 #endif  // EIGEN_HAS_C99_MATH
 
+
 /**************************************************************************************************************
  * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
  **************************************************************************************************************/
@@ -452,6 +726,228 @@ struct cephes_helper<double> {
   }
 };
 
+enum IgammaComputationMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE };
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC
+static EIGEN_STRONG_INLINE Scalar main_igamma_term(Scalar a, Scalar x) {
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    Scalar logax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (logax < -numext::log(NumTraits<Scalar>::highest()) ||
+        // Assuming x and a aren't Nan.
+        (numext::isnan)(logax)) {
+      return Scalar(0);
+    }
+    return numext::exp(logax);
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+EIGEN_DEVICE_FUNC
+int igamma_num_iterations() {
+  /* Returns the maximum number of internal iterations for igamma computation.
+   */
+  if (mode == VALUE) {
+    return 2000;
+  }
+
+  if (internal::is_same<Scalar, float>::value) {
+    return 200;
+  } else if (internal::is_same<Scalar, double>::value) {
+    return 500;
+  } else {
+    return 2000;
+  }
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igammac_cf_impl {
+  /* Computes igamc(a, x) or derivative (depending on the mode)
+   * using the continued fraction expansion of the complementary
+   * incomplete Gamma function.
+   *
+   * Preconditions:
+   *   a > 0
+   *   x >= 1
+   *   x >= a
+   */
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+    if ((numext::isinf)(x)) {
+      return zero;
+    }
+
+    Scalar ax = main_igamma_term<Scalar>(a, x);
+    // This is independent of mode. If this value is zero,
+    // then the function value is zero. If the function value is zero,
+    // then we are in a neighborhood where the function value evalutes to zero,
+    // so the derivative is zero.
+    if (ax == zero) {
+      return zero;
+    }
+
+    // continued fraction
+    Scalar y = one - a;
+    Scalar z = x + y + one;
+    Scalar c = zero;
+    Scalar pkm2 = one;
+    Scalar qkm2 = x;
+    Scalar pkm1 = x + one;
+    Scalar qkm1 = z * x;
+    Scalar ans = pkm1 / qkm1;
+
+    Scalar dpkm2_da = zero;
+    Scalar dqkm2_da = zero;
+    Scalar dpkm1_da = zero;
+    Scalar dqkm1_da = -x;
+    Scalar dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1;
+
+    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+      c += one;
+      y += one;
+      z += two;
+
+      Scalar yc = y * c;
+      Scalar pk = pkm1 * z - pkm2 * yc;
+      Scalar qk = qkm1 * z - qkm2 * yc;
+
+      Scalar dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c;
+      Scalar dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c;
+
+      if (qk != zero) {
+        Scalar ans_prev = ans;
+        ans = pk / qk;
+
+        Scalar dans_da_prev = dans_da;
+        dans_da = (dpk_da - ans * dqk_da) / qk;
+
+        if (mode == VALUE) {
+          if (numext::abs(ans_prev - ans) <= machep * numext::abs(ans)) {
+            break;
+          }
+        } else {
+          if (numext::abs(dans_da - dans_da_prev) <= machep) {
+            break;
+          }
+        }
+      }
+
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      dpkm2_da = dpkm1_da;
+      dpkm1_da = dpk_da;
+      dqkm2_da = dqkm1_da;
+      dqkm1_da = dqk_da;
+
+      if (numext::abs(pk) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+
+        dpkm2_da *= biginv;
+        dpkm1_da *= biginv;
+        dqkm2_da *= biginv;
+        dqkm1_da *= biginv;
+      }
+    }
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a);
+    Scalar dax_da = ax * dlogax_da;
+
+    switch (mode) {
+      case VALUE:
+        return ans * ax;
+      case DERIVATIVE:
+        return ans * dax_da + dans_da * ax;
+      case SAMPLE_DERIVATIVE:
+      default: // this is needed to suppress clang warning
+        return -(dans_da + ans * dlogax_da) * x;
+    }
+  }
+};
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_series_impl {
+  /* Computes igam(a, x) or its derivative (depending on the mode)
+   * using the series expansion of the incomplete Gamma function.
+   *
+   * Preconditions:
+   *   x > 0
+   *   a > 0
+   *   !(x > 1 && x > a)
+   */
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+
+    Scalar ax = main_igamma_term<Scalar>(a, x);
+
+    // This is independent of mode. If this value is zero,
+    // then the function value is zero. If the function value is zero,
+    // then we are in a neighborhood where the function value evalutes to zero,
+    // so the derivative is zero.
+    if (ax == zero) {
+      return zero;
+    }
+
+    ax /= a;
+
+    /* power series */
+    Scalar r = a;
+    Scalar c = one;
+    Scalar ans = one;
+
+    Scalar dc_da = zero;
+    Scalar dans_da = zero;
+
+    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+      r += one;
+      Scalar term = x / r;
+      Scalar dterm_da = -x / (r * r);
+      dc_da = term * dc_da + dterm_da * c;
+      dans_da += dc_da;
+      c *= term;
+      ans += c;
+
+      if (mode == VALUE) {
+        if (c <= machep * ans) {
+          break;
+        }
+      } else {
+        if (numext::abs(dc_da) <= machep * numext::abs(dans_da)) {
+          break;
+        }
+      }
+    }
+
+    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a + one);
+    Scalar dax_da = ax * dlogax_da;
+
+    switch (mode) {
+      case VALUE:
+        return ans * ax;
+      case DERIVATIVE:
+        return ans * dax_da + dans_da * ax;
+      case SAMPLE_DERIVATIVE:
+      default: // this is needed to suppress clang warning
+        return -(dans_da + ans * dlogax_da) * x / a;
+    }
+  }
+};
+
 #if !EIGEN_HAS_C99_MATH
 
 template <typename Scalar>
@@ -466,8 +962,6 @@ struct igammac_impl {
 
 #else
 
-template <typename Scalar> struct igamma_impl;  // predeclare igamma_impl
-
 template <typename Scalar>
 struct igammac_impl {
   EIGEN_DEVICE_FUNC
@@ -535,93 +1029,15 @@ struct igammac_impl {
       return nan;
     }
 
-    if ((x < one) || (x < a)) {
-      /* The checks above ensure that we meet the preconditions for
-       * igamma_impl::Impl(), so call it, rather than igamma_impl::Run().
-       * Calling Run() would also work, but in that case the compiler may not be
-       * able to prove that igammac_impl::Run and igamma_impl::Run are not
-       * mutually recursive.  This leads to worse code, particularly on
-       * platforms like nvptx, where recursion is allowed only begrudgingly.
-       */
-      return (one - igamma_impl<Scalar>::Impl(a, x));
-    }
-
-    return Impl(a, x);
-  }
-
- private:
-  /* igamma_impl calls igammac_impl::Impl. */
-  friend struct igamma_impl<Scalar>;
-
-  /* Actually computes igamc(a, x).
-   *
-   * Preconditions:
-   *   a > 0
-   *   x >= 1
-   *   x >= a
-   */
-  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
-    const Scalar zero = 0;
-    const Scalar one = 1;
-    const Scalar two = 2;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
-    const Scalar big = cephes_helper<Scalar>::big();
-    const Scalar biginv = cephes_helper<Scalar>::biginv();
-    const Scalar inf = NumTraits<Scalar>::infinity();
-
-    Scalar ans, ax, c, yc, r, t, y, z;
-    Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
-
-    if (x == inf) return zero;  // std::isinf crashes on CUDA
-
-    /* Compute  x**a * exp(-x) / gamma(a)  */
-    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if (ax < -maxlog) {  // underflow
-      return zero;
+    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
+      return nan;
     }
-    ax = numext::exp(ax);
 
-    // continued fraction
-    y = one - a;
-    z = x + y + one;
-    c = zero;
-    pkm2 = one;
-    qkm2 = x;
-    pkm1 = x + one;
-    qkm1 = z * x;
-    ans = pkm1 / qkm1;
-
-    while (true) {
-      c += one;
-      y += one;
-      z += two;
-      yc = y * c;
-      pk = pkm1 * z - pkm2 * yc;
-      qk = qkm1 * z - qkm2 * yc;
-      if (qk != zero) {
-        r = pk / qk;
-        t = numext::abs((ans - r) / r);
-        ans = r;
-      } else {
-        t = one;
-      }
-      pkm2 = pkm1;
-      pkm1 = pk;
-      qkm2 = qkm1;
-      qkm1 = qk;
-      if (numext::abs(pk) > big) {
-        pkm2 *= biginv;
-        pkm1 *= biginv;
-        qkm2 *= biginv;
-        qkm1 *= biginv;
-      }
-      if (t <= machep) {
-        break;
-      }
+    if ((x < one) || (x < a)) {
+      return (one - igamma_series_impl<Scalar, VALUE>::run(a, x));
     }
 
-    return (ans * ax);
+    return igammac_cf_impl<Scalar, VALUE>::run(a, x);
   }
 };
 
@@ -631,15 +1047,10 @@ struct igammac_impl {
  * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
  ************************************************************************************************/
 
-template <typename Scalar>
-struct igamma_retval {
-  typedef Scalar type;
-};
-
 #if !EIGEN_HAS_C99_MATH
 
-template <typename Scalar>
-struct igamma_impl {
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
     EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
@@ -650,69 +1061,17 @@ struct igamma_impl {
 
 #else
 
-template <typename Scalar>
-struct igamma_impl {
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
   EIGEN_DEVICE_FUNC
   static Scalar run(Scalar a, Scalar x) {
-    /*	igam()
-     *	Incomplete gamma integral
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double a, x, y, igam();
-     *
-     * y = igam( a, x );
-     *
-     * DESCRIPTION:
-     *
-     * The function is defined by
-     *
-     *                           x
-     *                            -
-     *                   1       | |  -t  a-1
-     *  igam(a,x)  =   -----     |   e   t   dt.
-     *                  -      | |
-     *                 | (a)    -
-     *                           0
-     *
-     *
-     * In this implementation both arguments must be positive.
-     * The integral is evaluated by either a power series or
-     * continued fraction expansion, depending on the relative
-     * values of a and x.
-     *
-     * ACCURACY (double):
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,30       200000       3.6e-14     2.9e-15
-     *    IEEE      0,100      300000       9.9e-14     1.5e-14
-     *
-     *
-     * ACCURACY (float):
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,30        20000       7.8e-6      5.9e-7
-     *
-     */
-    /*
-      Cephes Math Library Release 2.2: June, 1992
-      Copyright 1985, 1987, 1992 by Stephen L. Moshier
-      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
-    */
-
-
-    /* left tail of incomplete gamma function:
-     *
-     *          inf.      k
-     *   a  -x   -       x
-     *  x  e     >   ----------
-     *           -     -
-     *          k=0   | (a+k+1)
+    /* Depending on the mode, returns
+     * - VALUE: incomplete Gamma function igamma(a, x)
+     * - DERIVATIVE: derivative of incomplete Gamma function d/da igamma(a, x)
+     * - SAMPLE_DERIVATIVE: implicit derivative of a Gamma random variable
+     * x ~ Gamma(x | a, 1), dx/da = -1 / Gamma(x | a, 1) * d igamma(a, x) / dx
      *
+     * Derivatives are implemented by forward-mode differentiation.
      */
     const Scalar zero = 0;
     const Scalar one = 1;
@@ -724,67 +1083,167 @@ struct igamma_impl {
       return nan;
     }
 
+    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
+      return nan;
+    }
+
     if ((x > one) && (x > a)) {
-      /* The checks above ensure that we meet the preconditions for
-       * igammac_impl::Impl(), so call it, rather than igammac_impl::Run().
-       * Calling Run() would also work, but in that case the compiler may not be
-       * able to prove that igammac_impl::Run and igamma_impl::Run are not
-       * mutually recursive.  This leads to worse code, particularly on
-       * platforms like nvptx, where recursion is allowed only begrudgingly.
-       */
-      return (one - igammac_impl<Scalar>::Impl(a, x));
+      Scalar ret = igammac_cf_impl<Scalar, mode>::run(a, x);
+      if (mode == VALUE) {
+        return one - ret;
+      } else {
+        return -ret;
+      }
     }
 
-    return Impl(a, x);
+    return igamma_series_impl<Scalar, mode>::run(a, x);
   }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
 
- private:
-  /* igammac_impl calls igamma_impl::Impl. */
-  friend struct igammac_impl<Scalar>;
+template <typename Scalar>
+struct igamma_retval {
+  typedef Scalar type;
+};
 
-  /* Actually computes igam(a, x).
+template <typename Scalar>
+struct igamma_impl : igamma_generic_impl<Scalar, VALUE> {
+  /* igam()
+   * Incomplete gamma integral.
+   *
+   * The CDF of Gamma(a, 1) random variable at the point x.
+   *
+   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+   * The ground truth is computed by mpmath. Mean absolute error:
+   * float: 1.26713e-05
+   * double: 2.33606e-12
+   *
+   * Cephes documentation below.
+   *
+   * SYNOPSIS:
+   *
+   * double a, x, y, igam();
+   *
+   * y = igam( a, x );
+   *
+   * DESCRIPTION:
+   *
+   * The function is defined by
+   *
+   *                           x
+   *                            -
+   *                   1       | |  -t  a-1
+   *  igam(a,x)  =   -----     |   e   t   dt.
+   *                  -      | |
+   *                 | (a)    -
+   *                           0
+   *
+   *
+   * In this implementation both arguments must be positive.
+   * The integral is evaluated by either a power series or
+   * continued fraction expansion, depending on the relative
+   * values of a and x.
+   *
+   * ACCURACY (double):
+   *
+   *                      Relative error:
+   * arithmetic   domain     # trials      peak         rms
+   *    IEEE      0,30       200000       3.6e-14     2.9e-15
+   *    IEEE      0,100      300000       9.9e-14     1.5e-14
+   *
+   *
+   * ACCURACY (float):
+   *
+   *                      Relative error:
+   * arithmetic   domain     # trials      peak         rms
+   *    IEEE      0,30        20000       7.8e-6      5.9e-7
    *
-   * Preconditions:
-   *   x > 0
-   *   a > 0
-   *   !(x > 1 && x > a)
    */
-  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
-    const Scalar zero = 0;
-    const Scalar one = 1;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+  /*
+    Cephes Math Library Release 2.2: June, 1992
+    Copyright 1985, 1987, 1992 by Stephen L. Moshier
+    Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+  */
 
-    Scalar ans, ax, c, r;
+  /* left tail of incomplete gamma function:
+   *
+   *          inf.      k
+   *   a  -x   -       x
+   *  x  e     >   ----------
+   *           -     -
+   *          k=0   | (a+k+1)
+   *
+   */
+};
 
-    /* Compute  x**a * exp(-x) / gamma(a)  */
-    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if (ax < -maxlog) {
-      // underflow
-      return zero;
-    }
-    ax = numext::exp(ax);
+template <typename Scalar>
+struct igamma_der_a_retval : igamma_retval<Scalar> {};
 
-    /* power series */
-    r = a;
-    c = one;
-    ans = one;
+template <typename Scalar>
+struct igamma_der_a_impl : igamma_generic_impl<Scalar, DERIVATIVE> {
+  /* Derivative of the incomplete Gamma function with respect to a.
+   *
+   * Computes d/da igamma(a, x) by forward differentiation of the igamma code.
+   *
+   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+   * The ground truth is computed by mpmath. Mean absolute error:
+   * float: 6.17992e-07
+   * double: 4.60453e-12
+   *
+   * Reference:
+   * R. Moore. "Algorithm AS 187: Derivatives of the incomplete gamma
+   * integral". Journal of the Royal Statistical Society. 1982
+   */
+};
 
-    while (true) {
-      r += one;
-      c *= x/r;
-      ans += c;
-      if (c/ans <= machep) {
-        break;
-      }
-    }
+template <typename Scalar>
+struct gamma_sample_der_alpha_retval : igamma_retval<Scalar> {};
 
-    return (ans * ax / a);
-  }
+template <typename Scalar>
+struct gamma_sample_der_alpha_impl
+    : igamma_generic_impl<Scalar, SAMPLE_DERIVATIVE> {
+  /* Derivative of a Gamma random variable sample with respect to alpha.
+   *
+   * Consider a sample of a Gamma random variable with the concentration
+   * parameter alpha: sample ~ Gamma(alpha, 1). The reparameterization
+   * derivative that we want to compute is dsample / dalpha =
+   * d igammainv(alpha, u) / dalpha, where u = igamma(alpha, sample).
+   * However, this formula is numerically unstable and expensive, so instead
+   * we use implicit differentiation:
+   *
+   * igamma(alpha, sample) = u, where u ~ Uniform(0, 1).
+   * Apply d / dalpha to both sides:
+   * d igamma(alpha, sample) / dalpha
+   *     + d igamma(alpha, sample) / dsample * dsample/dalpha  = 0
+   * d igamma(alpha, sample) / dalpha
+   *     + Gamma(sample | alpha, 1) dsample / dalpha = 0
+   * dsample/dalpha = - (d igamma(alpha, sample) / dalpha)
+   *                   / Gamma(sample | alpha, 1)
+   *
+   * Here Gamma(sample | alpha, 1) is the PDF of the Gamma distribution
+   * (note that the derivative of the CDF w.r.t. sample is the PDF).
+   * See the reference below for more details.
+   *
+   * The derivative of igamma(alpha, sample) is computed by forward
+   * differentiation of the igamma code. Division by the Gamma PDF is performed
+   * in the same code, increasing the accuracy and speed due to cancellation
+   * of some terms.
+   *
+   * Accuracy estimation. For each alpha in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables sample ~ Gamma(sample | alpha, 1), a total of 300
+   * points. The ground truth is computed by mpmath. Mean absolute error:
+   * float: 2.1686e-06
+   * double: 1.4774e-12
+   *
+   * Reference:
+   * M. Figurnov, S. Mohamed, A. Mnih "Implicit Reparameterization Gradients".
+   * 2018
+   */
 };
 
-#endif  // EIGEN_HAS_C99_MATH
-
 /*****************************************************************************
  * Implementation of Riemann zeta function of two arguments, based on Cephes *
  *****************************************************************************/
@@ -944,7 +1403,12 @@ struct zeta_impl {
         {
             if(q == numext::floor(q))
             {
-                return maxnum;
+                if (x == numext::floor(x) && long(x) % 2 == 0) {
+                    return maxnum;
+                }
+                else {
+                    return nan;
+                }
             }
             p = x;
             r = numext::floor(p);
@@ -1020,11 +1484,11 @@ struct polygamma_impl {
         Scalar nplus = n + one;
         const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
-        // Check that n is an integer
-        if (numext::floor(n) != n) {
+        // Check that n is a non-negative integer
+        if (numext::floor(n) != n || n < zero) {
             return nan;
         }
-        // Just return the digamma function for n = 1
+        // Just return the digamma function for n = 0
         else if (n == zero) {
             return digamma_impl<Scalar>::run(x);
         }
@@ -1392,7 +1856,7 @@ struct betainc_helper<double> {
     if ((a + b) < maxgam && numext::abs(u) < maxlog) {
       t = gamma(a + b) / (gamma(a) * gamma(b));
       s = s * t * pow(x, a);
-    } else {
+    }
     */
     t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
         lgamma_impl<double>::run(b) + u + numext::log(s);
@@ -1539,12 +2003,30 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
   return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar)
+    ndtri(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x);
+}
+
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
     igamma(const Scalar& a, const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
 }
 
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma_der_a, Scalar)
+    igamma_der_a(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igamma_der_a, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(gamma_sample_der_alpha, Scalar)
+    gamma_sample_der_alpha(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(gamma_sample_der_alpha, Scalar)::run(a, x);
+}
+
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
     igammac(const Scalar& a, const Scalar& x) {
@@ -1558,8 +2040,6 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
 }
 
 }  // end namespace numext
-
-
 }  // end namespace Eigen
 
 #endif  // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
index 46d60d323e99f4b49b5c5447958bf696fddad681..2bb017921dd59524fa343ad5ed99d20744bf07c6 100644
--- a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -38,10 +38,32 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
 
+/** \internal \returns the ndtri(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pndtri(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  using internal::generic_ndtri; return generic_ndtri<Packet, ScalarType>(a);
+}
+
 /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
 
+/** \internal \returns the derivative of the incomplete gamma function
+ * igamma_der_a(\a a, \a x) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma_der_a(const Packet& a, const Packet& x) {
+  using numext::igamma_der_a; return igamma_der_a(a, x);
+}
+
+/** \internal \returns compute the derivative of the sample
+  * of Gamma(alpha, 1) random variable with respect to the parameter a
+  * gamma_sample_der_alpha(\a alpha, \a sample) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pgamma_sample_der_alpha(const Packet& alpha, const Packet& sample) {
+  using numext::gamma_sample_der_alpha; return gamma_sample_der_alpha(alpha, sample);
+}
+
 /** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
@@ -55,4 +77,3 @@ Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext
 } // end namespace Eigen
 
 #endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
-
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d76692097f50386c97b9e33b7ebe925aa327bed
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX_BESSELFUNCTIONS_H
+#define EIGEN_AVX_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX_BESSELFUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..35e62a8acb25e0d56334183f70bad9a3909de64a
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX_SPECIALFUNCTIONS_H
+#define EIGEN_AVX_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, perf)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, perf)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pndtri)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX_SPECIAL_FUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..7dd3c3e5be6de84176da2459a860fec6c344c770
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX512_BESSELFUNCTIONS_H
+#define EIGEN_AVX512_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX512_BESSELFUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..79878f2b6967ed3125d1726df48df95e9b63721d
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX512_SPECIALFUNCTIONS_H
+#define EIGEN_AVX512_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, perf)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, perf)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pndtri)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX512_SPECIAL_FUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
deleted file mode 100644
index ec4fa84488cb7ab2bc150ab01cbd63395d9afe26..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
+++ /dev/null
@@ -1,165 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
-#define EIGEN_CUDA_SPECIALFUNCTIONS_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plgamma<float4>(const float4& a)
-{
-  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plgamma<double2>(const double2& a)
-{
-  using numext::lgamma;
-  return make_double2(lgamma(a.x), lgamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pdigamma<float4>(const float4& a)
-{
-  using numext::digamma;
-  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pdigamma<double2>(const double2& a)
-{
-  using numext::digamma;
-  return make_double2(digamma(a.x), digamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pzeta<float4>(const float4& x, const float4& q)
-{
-    using numext::zeta;
-    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pzeta<double2>(const double2& x, const double2& q)
-{
-    using numext::zeta;
-    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 ppolygamma<float4>(const float4& n, const float4& x)
-{
-    using numext::polygamma;
-    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 ppolygamma<double2>(const double2& n, const double2& x)
-{
-    using numext::polygamma;
-    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perf<float4>(const float4& a)
-{
-  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perf<double2>(const double2& a)
-{
-  using numext::erf;
-  return make_double2(erf(a.x), erf(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perfc<float4>(const float4& a)
-{
-  using numext::erfc;
-  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perfc<double2>(const double2& a)
-{
-  using numext::erfc;
-  return make_double2(erfc(a.x), erfc(a.y));
-}
-
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigamma<float4>(const float4& a, const float4& x)
-{
-  using numext::igamma;
-  return make_float4(
-      igamma(a.x, x.x),
-      igamma(a.y, x.y),
-      igamma(a.z, x.z),
-      igamma(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigamma<double2>(const double2& a, const double2& x)
-{
-  using numext::igamma;
-  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigammac<float4>(const float4& a, const float4& x)
-{
-  using numext::igammac;
-  return make_float4(
-      igammac(a.x, x.x),
-      igammac(a.y, x.y),
-      igammac(a.z, x.z),
-      igammac(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigammac<double2>(const double2& a, const double2& x)
-{
-  using numext::igammac;
-  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
-{
-  using numext::betainc;
-  return make_float4(
-      betainc(a.x, b.x, x.x),
-      betainc(a.y, b.y, x.y),
-      betainc(a.z, b.z, x.z),
-      betainc(a.w, b.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
-{
-  using numext::betainc;
-  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
-}
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd3bf4dd13bdf923db53be7f661f04367e1171a0
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
@@ -0,0 +1,369 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
+#define EIGEN_GPU_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+  using numext::lgamma;
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+  using numext::digamma;
+  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+  using numext::digamma;
+  return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+    using numext::zeta;
+    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+    using numext::zeta;
+    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+    using numext::polygamma;
+    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+    using numext::polygamma;
+    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+  using numext::erf;
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+  using numext::erfc;
+  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+  using numext::erfc;
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pndtri<float4>(const float4& a)
+{
+  using numext::ndtri;
+  return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pndtri<double2>(const double2& a)
+{
+  using numext::ndtri;
+  return make_double2(ndtri(a.x), ndtri(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+  using numext::igamma;
+  return make_float4(
+      igamma(a.x, x.x),
+      igamma(a.y, x.y),
+      igamma(a.z, x.z),
+      igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma_der_a<float4>(
+    const float4& a, const float4& x) {
+  using numext::igamma_der_a;
+  return make_float4(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y),
+                     igamma_der_a(a.z, x.z), igamma_der_a(a.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pigamma_der_a<double2>(const double2& a, const double2& x) {
+  using numext::igamma_der_a;
+  return make_double2(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pgamma_sample_der_alpha<float4>(
+    const float4& alpha, const float4& sample) {
+  using numext::gamma_sample_der_alpha;
+  return make_float4(
+      gamma_sample_der_alpha(alpha.x, sample.x),
+      gamma_sample_der_alpha(alpha.y, sample.y),
+      gamma_sample_der_alpha(alpha.z, sample.z),
+      gamma_sample_der_alpha(alpha.w, sample.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pgamma_sample_der_alpha<double2>(const double2& alpha, const double2& sample) {
+  using numext::gamma_sample_der_alpha;
+  return make_double2(
+      gamma_sample_der_alpha(alpha.x, sample.x),
+      gamma_sample_der_alpha(alpha.y, sample.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+  using numext::igammac;
+  return make_float4(
+      igammac(a.x, x.x),
+      igammac(a.y, x.y),
+      igammac(a.z, x.z),
+      igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+  using numext::betainc;
+  return make_float4(
+      betainc(a.x, b.x, x.x),
+      betainc(a.y, b.y, x.y),
+      betainc(a.z, b.z, x.z),
+      betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+  using numext::betainc;
+  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0e<float4>(const float4& x) {
+  using numext::bessel_i0e;
+  return make_float4(bessel_i0e(x.x), bessel_i0e(x.y), bessel_i0e(x.z), bessel_i0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i0e<double2>(const double2& x) {
+  using numext::bessel_i0e;
+  return make_double2(bessel_i0e(x.x), bessel_i0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0<float4>(const float4& x) {
+  using numext::bessel_i0;
+  return make_float4(bessel_i0(x.x), bessel_i0(x.y), bessel_i0(x.z), bessel_i0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i0<double2>(const double2& x) {
+  using numext::bessel_i0;
+  return make_double2(bessel_i0(x.x), bessel_i0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1e<float4>(const float4& x) {
+  using numext::bessel_i1e;
+  return make_float4(bessel_i1e(x.x), bessel_i1e(x.y), bessel_i1e(x.z), bessel_i1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i1e<double2>(const double2& x) {
+  using numext::bessel_i1e;
+  return make_double2(bessel_i1e(x.x), bessel_i1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1<float4>(const float4& x) {
+  using numext::bessel_i1;
+  return make_float4(bessel_i1(x.x), bessel_i1(x.y), bessel_i1(x.z), bessel_i1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i1<double2>(const double2& x) {
+  using numext::bessel_i1;
+  return make_double2(bessel_i1(x.x), bessel_i1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0e<float4>(const float4& x) {
+  using numext::bessel_k0e;
+  return make_float4(bessel_k0e(x.x), bessel_k0e(x.y), bessel_k0e(x.z), bessel_k0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k0e<double2>(const double2& x) {
+  using numext::bessel_k0e;
+  return make_double2(bessel_k0e(x.x), bessel_k0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0<float4>(const float4& x) {
+  using numext::bessel_k0;
+  return make_float4(bessel_k0(x.x), bessel_k0(x.y), bessel_k0(x.z), bessel_k0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k0<double2>(const double2& x) {
+  using numext::bessel_k0;
+  return make_double2(bessel_k0(x.x), bessel_k0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1e<float4>(const float4& x) {
+  using numext::bessel_k1e;
+  return make_float4(bessel_k1e(x.x), bessel_k1e(x.y), bessel_k1e(x.z), bessel_k1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k1e<double2>(const double2& x) {
+  using numext::bessel_k1e;
+  return make_double2(bessel_k1e(x.x), bessel_k1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1<float4>(const float4& x) {
+  using numext::bessel_k1;
+  return make_float4(bessel_k1(x.x), bessel_k1(x.y), bessel_k1(x.z), bessel_k1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k1<double2>(const double2& x) {
+  using numext::bessel_k1;
+  return make_double2(bessel_k1(x.x), bessel_k1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j0<float4>(const float4& x) {
+  using numext::bessel_j0;
+  return make_float4(bessel_j0(x.x), bessel_j0(x.y), bessel_j0(x.z), bessel_j0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_j0<double2>(const double2& x) {
+  using numext::bessel_j0;
+  return make_double2(bessel_j0(x.x), bessel_j0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j1<float4>(const float4& x) {
+  using numext::bessel_j1;
+  return make_float4(bessel_j1(x.x), bessel_j1(x.y), bessel_j1(x.z), bessel_j1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_j1<double2>(const double2& x) {
+  using numext::bessel_j1;
+  return make_double2(bessel_j1(x.x), bessel_j1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y0<float4>(const float4& x) {
+  using numext::bessel_y0;
+  return make_float4(bessel_y0(x.x), bessel_y0(x.y), bessel_y0(x.z), bessel_y0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_y0<double2>(const double2& x) {
+  using numext::bessel_y0;
+  return make_double2(bessel_y0(x.x), bessel_y0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y1<float4>(const float4& x) {
+  using numext::bessel_y1;
+  return make_float4(bessel_y1(x.x), bessel_y1(x.y), bessel_y1(x.z), bessel_y1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_y1<double2>(const double2& x) {
+  using numext::bessel_y1;
+  return make_double2(bessel_y1(x.x), bessel_y1(x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GPU_SPECIALFUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..67433b057cf384084243b65cbd9e56982b9e671b
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
@@ -0,0 +1,54 @@
+#ifndef EIGEN_NEON_BESSELFUNCTIONS_H
+#define EIGEN_NEON_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD)                            \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) {                       \
+  const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x)));  \
+  const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));              \
+}                                                                       \
+                                                                        \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) {                       \
+  return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x)));               \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y1)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_NEON_BESSELFUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec9295197c5ee4e50eec7e64da7a353b0478f247
--- /dev/null
+++ b/contrib/eigen/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
@@ -0,0 +1,34 @@
+#ifndef EIGEN_NEON_SPECIALFUNCTIONS_H
+#define EIGEN_NEON_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD)                            \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) {                       \
+  const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x)));  \
+  const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));              \
+}                                                                       \
+                                                                        \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) {                       \
+  return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x)));               \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(perf)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pndtri)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, perf)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_NEON_SPECIALFUNCTIONS_H
diff --git a/contrib/eigen/unsupported/Eigen/src/Splines/Spline.h b/contrib/eigen/unsupported/Eigen/src/Splines/Spline.h
index f1ae92d79faa5dbea55339dabe419e3ad2aa973e..79edd52ce6df14df735b8f3889fc3e58696e1d6d 100644
--- a/contrib/eigen/unsupported/Eigen/src/Splines/Spline.h
+++ b/contrib/eigen/unsupported/Eigen/src/Splines/Spline.h
@@ -255,7 +255,7 @@ namespace Eigen
     const KnotVectorType& U = knots;
 
     BasisVectorType left(p+1); left(0) = Scalar(0);
-    BasisVectorType right(p+1); right(0) = Scalar(0);        
+    BasisVectorType right(p+1); right(0) = Scalar(0);
 
     VectorBlock<BasisVectorType,Degree>(left,1,p) = u - VectorBlock<const KnotVectorType,Degree>(U,i+1-p,p).reverse();
     VectorBlock<BasisVectorType,Degree>(right,1,p) = VectorBlock<const KnotVectorType,Degree>(U,i+1,p) - u;
diff --git a/contrib/eigen/unsupported/Eigen/src/Splines/SplineFitting.h b/contrib/eigen/unsupported/Eigen/src/Splines/SplineFitting.h
index c761a9b3d898384a33b6f2e39f3c68cfc2e88ef8..9f6e8afa0de8c4d67505fb051960ac808ed610fd 100644
--- a/contrib/eigen/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/contrib/eigen/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -17,8 +17,8 @@
 
 #include "SplineFwd.h"
 
-#include <Eigen/LU>
-#include <Eigen/QR>
+#include "../../../../Eigen/LU"
+#include "../../../../Eigen/QR"
 
 namespace Eigen
 {
@@ -181,7 +181,7 @@ namespace Eigen
    * \ingroup Splines_Module
    *
    * \param[in] pts The data points to which a spline should be fit.
-   * \param[out] chord_lengths The resulting chord lenggth vector.
+   * \param[out] chord_lengths The resulting chord length vector.
    *
    * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
    **/   
@@ -385,7 +385,7 @@ namespace Eigen
     {
       const DenseIndex span = SplineType::Span(parameters[i], degree, knots);
 
-      if (derivativeIndices[derivativeIndex] == i)
+      if (derivativeIndex < derivativeIndices.size() && derivativeIndices[derivativeIndex] == i)
       {
         A.block(row, span - degree, 2, degree + 1)
           = SplineType::BasisFunctionDerivatives(parameters[i], 1, degree, knots);
@@ -395,8 +395,9 @@ namespace Eigen
       }
       else
       {
-        A.row(row++).segment(span - degree, degree + 1)
+        A.row(row).segment(span - degree, degree + 1)
           = SplineType::BasisFunctions(parameters[i], degree, knots);
+        b.col(row++) = points.col(i);
       }
     }
     b.col(0) = points.col(0);
diff --git a/contrib/eigen/unsupported/Eigen/src/Splines/SplineFwd.h b/contrib/eigen/unsupported/Eigen/src/Splines/SplineFwd.h
index 0a95fbf3ead4c225528ac4a51298248ba2f0707b..00d6b4921cca533b4fbc7b4a1eeac3774b6f53ff 100644
--- a/contrib/eigen/unsupported/Eigen/src/Splines/SplineFwd.h
+++ b/contrib/eigen/unsupported/Eigen/src/Splines/SplineFwd.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SPLINES_FWD_H
 #define EIGEN_SPLINES_FWD_H
 
-#include <Eigen/Core>
+#include "../../../../Eigen/Core"
 
 namespace Eigen
 {
diff --git a/contrib/eigen/unsupported/README.txt b/contrib/eigen/unsupported/README.txt
index 83479ff0bb549d8bcad91505f130226ee1675f2a..70793bf13ce3214d95d2edfdc6e8a3cae3456884 100644
--- a/contrib/eigen/unsupported/README.txt
+++ b/contrib/eigen/unsupported/README.txt
@@ -20,7 +20,7 @@ However, it:
  - must rely on Eigen,
  - must be highly related to math,
  - should have some general purpose in the sense that it could
-   potentially become an offical Eigen module (or be merged into another one).
+   potentially become an official Eigen module (or be merged into another one).
 
 In doubt feel free to contact us. For instance, if your addons is very too specific
 but it shows an interesting way of using Eigen, then it could be a nice demo.
diff --git a/contrib/eigen/unsupported/bench/bench_svd.cpp b/contrib/eigen/unsupported/bench/bench_svd.cpp
index 01d8231aeac0434386a3fee1a060c34888751224..e7028a2b91810385071eb98a841e1a5924ae23e8 100644
--- a/contrib/eigen/unsupported/bench/bench_svd.cpp
+++ b/contrib/eigen/unsupported/bench/bench_svd.cpp
@@ -70,7 +70,7 @@ void bench_svd(const MatrixType& a = MatrixType())
   std::cout<< std::endl;
   timerJacobi.reset();
   timerBDC.reset();
-  cout << " Computes rotaion matrix" <<endl;
+  cout << " Computes rotation matrix" <<endl;
   for (int k=1; k<=NUMBER_SAMPLE; ++k)
   {
     timerBDC.start();
diff --git a/contrib/eigen/unsupported/doc/Overview.dox b/contrib/eigen/unsupported/doc/Overview.dox
index 45464a5458f37c1e2063ef84d478c41277e95e04..bae51dcf60902481907462b67de98786efe3b5cc 100644
--- a/contrib/eigen/unsupported/doc/Overview.dox
+++ b/contrib/eigen/unsupported/doc/Overview.dox
@@ -11,6 +11,8 @@ Click on the \e Modules tab at the top of this page to get a list of all unsuppo
 
 Don't miss the <a href="../index.html">official Eigen documentation</a>.
 
+ \subpage SYCL_EIGEN "SYCL backend for Eigen"
+
 */
 
 /*
@@ -26,3 +28,4 @@ subject to be included in %Eigen in the future.
 /// \internal \brief Namespace containing low-level routines from the %Eigen library.
 namespace internal {}
 }
+
diff --git a/contrib/eigen/unsupported/doc/SYCL.dox b/contrib/eigen/unsupported/doc/SYCL.dox
new file mode 100644
index 0000000000000000000000000000000000000000..2295adf21fec0a778366ae1a38c759ec33147071
--- /dev/null
+++ b/contrib/eigen/unsupported/doc/SYCL.dox
@@ -0,0 +1,9 @@
+/** \page SYCL_EIGEN Eigen SYCL Backend
+
+Useful information for Eigen SYCL Backend:
+
+- <a href="https://developer.codeplay.com/computecppce/latest/getting-started-with-eigen">Getting Started with Eigen</a> 
+
+- <a href="https://developer.codeplay.com/computecppce/latest/options-for-building-eigen-sycl">Options for Building Eigen SYCL</a>  
+
+*/
diff --git a/contrib/eigen/unsupported/doc/examples/CMakeLists.txt b/contrib/eigen/unsupported/doc/examples/CMakeLists.txt
index c47646dfca9d76e45cf929d74508e32698d96ee3..7bb67736c6e38230b6eac9e2926d339d1c50e558 100644
--- a/contrib/eigen/unsupported/doc/examples/CMakeLists.txt
+++ b/contrib/eigen/unsupported/doc/examples/CMakeLists.txt
@@ -1,20 +1,24 @@
-FILE(GLOB examples_SRCS "*.cpp")
+file(GLOB examples_SRCS "*.cpp")
 
-ADD_CUSTOM_TARGET(unsupported_examples)
+add_custom_target(unsupported_examples)
 
-INCLUDE_DIRECTORIES(../../../unsupported ../../../unsupported/test)
+include_directories(../../../unsupported ../../../unsupported/test)
 
-FOREACH(example_src ${examples_SRCS})
-  GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
-  ADD_EXECUTABLE(example_${example} ${example_src})
+foreach(example_src ${examples_SRCS})
+  get_filename_component(example ${example_src} NAME_WE)
+  add_executable(example_${example} ${example_src})
   if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
     target_link_libraries(example_${example} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
   endif()
-  ADD_CUSTOM_COMMAND(
+  add_custom_command(
     TARGET example_${example}
     POST_BUILD
     COMMAND example_${example}
     ARGS >${CMAKE_CURRENT_BINARY_DIR}/${example}.out
   )
-  ADD_DEPENDENCIES(unsupported_examples example_${example})
-ENDFOREACH(example_src)
+  add_dependencies(unsupported_examples example_${example})
+endforeach(example_src)
+
+if(EIGEN_TEST_SYCL)
+  add_subdirectory(SYCL)
+endif(EIGEN_TEST_SYCL)
diff --git a/contrib/eigen/unsupported/doc/examples/EulerAngles.cpp b/contrib/eigen/unsupported/doc/examples/EulerAngles.cpp
index 1ef6aee182d6c8dc6782455e9d446821dff2287d..3f8ca8c1742f19585f488d15fe1ded4b37e61296 100644
--- a/contrib/eigen/unsupported/doc/examples/EulerAngles.cpp
+++ b/contrib/eigen/unsupported/doc/examples/EulerAngles.cpp
@@ -23,7 +23,7 @@ int main()
   // Some Euler angles representation that our plane use.
   EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794);
   
-  MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles);
+  MyArmyAngles planeAnglesInMyArmyAngles(planeAngles);
   
   std::cout << "vehicle angles(MyArmy):     " << vehicleAngles << std::endl;
   std::cout << "plane angles(ZYZ):        " << planeAngles << std::endl;
@@ -37,7 +37,7 @@ int main()
   Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles;
   
   planeAngles = planeRotated;
-  planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated);
+  planeAnglesInMyArmyAngles = planeRotated;
   
   std::cout << "new plane angles(ZYZ):     " << planeAngles << std::endl;
   std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl;
diff --git a/contrib/eigen/unsupported/doc/examples/SYCL/CMakeLists.txt b/contrib/eigen/unsupported/doc/examples/SYCL/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1d0f721dc1228061edad1ba71680713fb38c7764
--- /dev/null
+++ b/contrib/eigen/unsupported/doc/examples/SYCL/CMakeLists.txt
@@ -0,0 +1,37 @@
+FILE(GLOB examples_SRCS "*.cpp")
+
+set(EIGEN_SYCL ON)
+list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
+if(EIGEN_SYCL_TRISYCL)
+  set(CMAKE_CXX_STANDARD 17)
+else(EIGEN_SYCL_TRISYCL)
+  if(MSVC)
+    # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+    # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+    set(CMAKE_CXX_STANDARD 14)
+    list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+  else()
+    set(CMAKE_CXX_STANDARD 11)
+    list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+  endif()
+  # The following flags are not supported by Clang and can cause warnings
+  # if used with -Werror so they are removed here.
+  if(COMPUTECPP_USE_COMPILER_DRIVER)
+    set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+    string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  endif()
+  list(APPEND COMPUTECPP_USER_FLAGS
+      -DEIGEN_NO_ASSERTION_CHECKING=1
+      -no-serial-memop
+      -Xclang
+      -cl-mad-enable)
+endif(EIGEN_SYCL_TRISYCL)
+
+FOREACH(example_src ${examples_SRCS})
+  GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
+  ei_add_test_internal(${example} example_${example})
+  ADD_DEPENDENCIES(unsupported_examples example_${example})
+ENDFOREACH(example_src)
+set(EIGEN_SYCL OFF)
diff --git a/contrib/eigen/unsupported/doc/examples/SYCL/CwiseMul.cpp b/contrib/eigen/unsupported/doc/examples/SYCL/CwiseMul.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7c33140e226cdf44601296715e53343990d0768
--- /dev/null
+++ b/contrib/eigen/unsupported/doc/examples/SYCL/CwiseMul.cpp
@@ -0,0 +1,63 @@
+#include <iostream>
+#define EIGEN_USE_SYCL
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+int main()
+{
+  using DataType = float;
+  using IndexType = int64_t;
+  constexpr auto DataLayout = Eigen::RowMajor;
+
+  auto devices = Eigen::get_sycl_supported_devices();
+  const auto device_selector = *devices.begin();
+  Eigen::QueueInterface queueInterface(device_selector);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  
+  // create the tensors to be used in the operation
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 3;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+
+  // initialize the tensors with the data we want manipulate to
+  Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
+
+  // set up some random data in the tensors to be multiplied
+  in1 = in1.random();
+  in2 = in2.random();
+
+  // allocate memory for the tensors
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  // 
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+
+  // copy the memory to the device and do the c=a*b calculation
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  // print out the results
+   for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        std::cout << "device_out" << "(" << i << ", " << j << ", " << k << ") : " << out(i,j,k) 
+                  << " vs host_out" << "(" << i << ", " << j << ", " << k << ") : " << in1(i,j,k) * in2(i,j,k) << "\n";
+      }
+    }
+  }
+  printf("c=a*b Done\n");
+}
diff --git a/contrib/eigen/unsupported/doc/snippets/CMakeLists.txt b/contrib/eigen/unsupported/doc/snippets/CMakeLists.txt
index f0c5cc2a84dd65edd575963bf648a1481f9b1dde..adf95a8dbca0f941a3676fb0d2b9da8fc8313a7c 100644
--- a/contrib/eigen/unsupported/doc/snippets/CMakeLists.txt
+++ b/contrib/eigen/unsupported/doc/snippets/CMakeLists.txt
@@ -1,26 +1,26 @@
-FILE(GLOB snippets_SRCS "*.cpp")
+file(GLOB snippets_SRCS "*.cpp")
 
-ADD_CUSTOM_TARGET(unsupported_snippets)
+add_custom_target(unsupported_snippets)
 
-FOREACH(snippet_src ${snippets_SRCS})
-  GET_FILENAME_COMPONENT(snippet ${snippet_src} NAME_WE)
-  SET(compile_snippet_target compile_${snippet})
-  SET(compile_snippet_src ${compile_snippet_target}.cpp)
-  FILE(READ ${snippet_src} snippet_source_code)
-  CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/doc/snippets/compile_snippet.cpp.in
+foreach(snippet_src ${snippets_SRCS})
+  get_filename_component(snippet ${snippet_src} NAME_WE)
+  set(compile_snippet_target compile_${snippet})
+  set(compile_snippet_src ${compile_snippet_target}.cpp)
+  file(READ ${snippet_src} snippet_source_code)
+  configure_file(${PROJECT_SOURCE_DIR}/doc/snippets/compile_snippet.cpp.in
                  ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
-  ADD_EXECUTABLE(${compile_snippet_target}
+  add_executable(${compile_snippet_target}
                  ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
   if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
     target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
   endif()
-  ADD_CUSTOM_COMMAND(
+  add_custom_command(
     TARGET ${compile_snippet_target}
     POST_BUILD
     COMMAND ${compile_snippet_target}
     ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out
   )
-  ADD_DEPENDENCIES(unsupported_snippets ${compile_snippet_target})
+  add_dependencies(unsupported_snippets ${compile_snippet_target})
   set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
                               PROPERTIES OBJECT_DEPENDS ${snippet_src})
-ENDFOREACH(snippet_src)
+endforeach(snippet_src)
diff --git a/contrib/eigen/unsupported/test/BVH.cpp b/contrib/eigen/unsupported/test/BVH.cpp
index ff5b3299d679379567e0a81a29f3cb914690e737..d8c39d5568817e803847000d9c0c7f1be891b39a 100644
--- a/contrib/eigen/unsupported/test/BVH.cpp
+++ b/contrib/eigen/unsupported/test/BVH.cpp
@@ -192,7 +192,7 @@ struct TreeTest
 };
 
 
-void test_BVH()
+EIGEN_DECLARE_TEST(BVH)
 {
   for(int i = 0; i < g_repeat; i++) {
 #ifdef EIGEN_TEST_PART_1
diff --git a/contrib/eigen/unsupported/test/CMakeLists.txt b/contrib/eigen/unsupported/test/CMakeLists.txt
index 3a8775a1cc189738296d555255820d26fe6942fb..d30fa62bd30c7ed07707a6f48b5cf20ecd414939 100644
--- a/contrib/eigen/unsupported/test/CMakeLists.txt
+++ b/contrib/eigen/unsupported/test/CMakeLists.txt
@@ -1,16 +1,7 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
-  foreach(i RANGE 1 999)
-    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
-      "#ifdef EIGEN_TEST_PART_${i}\n"
-      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
-      "#else\n"
-      "#define CALL_SUBTEST_${i}(FUNC)\n"
-      "#endif\n\n"
-    )
-  endforeach()
+# The file split_test_helper.h was generated at first run,
+# it is now included in test/
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
 endif()
 
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
@@ -22,17 +13,17 @@ include_directories(../../test ../../unsupported ../../Eigen
 find_package (Threads)
 
 find_package(GoogleHash)
-if(GOOGLEHASH_FOUND)
+if(GoogleHash_FOUND)
   add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
   include_directories(${GOOGLEHASH_INCLUDES})
   ei_add_property(EIGEN_TESTED_BACKENDS  "GoogleHash, ")
-else(GOOGLEHASH_FOUND)
+else()
   ei_add_property(EIGEN_MISSING_BACKENDS  "GoogleHash, ")
-endif(GOOGLEHASH_FOUND)
+endif()
 
 
 find_package(Adolc)
-if(ADOLC_FOUND)
+if(Adolc_FOUND)
   include_directories(${ADOLC_INCLUDES})
   ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ")
   if(EIGEN_TEST_CXX11)
@@ -40,9 +31,9 @@ if(ADOLC_FOUND)
   else()
     message(STATUS "Adolc found, but tests require C++11 mode")
   endif()
-else(ADOLC_FOUND)
+else()
   ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ")
-endif(ADOLC_FOUND)
+endif()
 
 # this test seems to never have been successful on x87, so is considered to contain a FP-related bug.
 # see thread: "non-linear optimization test summary"
@@ -52,9 +43,7 @@ ei_add_test(NumericalDiff)
 ei_add_test(autodiff_scalar)
 ei_add_test(autodiff)
 
-if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
 ei_add_test(BVH)
-endif()
 
 ei_add_test(matrix_exponential)
 ei_add_test(matrix_function)
@@ -66,13 +55,11 @@ ei_add_test(FFT)
 
 ei_add_test(EulerAngles)
 
-find_package(MPFR 2.3.0)
-find_package(GMP)
-if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
-  include_directories(${MPFR_INCLUDES} ./mpreal)
+find_package(MPREAL)
+if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
-  set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
- ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+  include_directories(${MPREAL_INCLUDES})
+  ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
@@ -92,8 +79,8 @@ else()
   ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
 endif()
 
-option(EIGEN_TEST_NO_OPENGL "Disable OpenGL support in unit tests" OFF)
-if(NOT EIGEN_TEST_NO_OPENGL)
+option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
+if(EIGEN_TEST_OPENGL)
   find_package(OpenGL)
   find_package(GLUT)
   find_package(GLEW)
@@ -113,90 +100,192 @@ ei_add_test(polynomialsolver)
 ei_add_test(polynomialutils)
 ei_add_test(splines)
 ei_add_test(gmres)
+ei_add_test(dgmres)
 ei_add_test(minres)
+ei_add_test(idrs)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
+ei_add_test(bessel_functions)
 ei_add_test(special_functions)
-
-# TODO: The following test names are prefixed with the cxx11 string, since historically
-# the tests depended on c++11. This isn't the case anymore so we ought to rename them.
-# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests
-# when using visual studio. We should make the check more strict to enable the tests for
-# newer versions of MSVC.
-if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-ei_add_test(cxx11_tensor_dimension)
-ei_add_test(cxx11_tensor_map)
-ei_add_test(cxx11_tensor_assign)
-ei_add_test(cxx11_tensor_comparisons)
-ei_add_test(cxx11_tensor_forced_eval)
-ei_add_test(cxx11_tensor_math)
-ei_add_test(cxx11_tensor_const)
-ei_add_test(cxx11_tensor_intdiv)
-ei_add_test(cxx11_tensor_casts)
-ei_add_test(cxx11_tensor_empty)
-ei_add_test(cxx11_tensor_sugar)
-ei_add_test(cxx11_tensor_roundings)
-ei_add_test(cxx11_tensor_layout_swap)
-ei_add_test(cxx11_tensor_io)
-if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-  # This test requires __uint128_t which is only available on 64bit systems
-  ei_add_test(cxx11_tensor_uint128)
-endif()
-endif()
+ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
 
 if(EIGEN_TEST_CXX11)
   if(EIGEN_TEST_SYCL)
-    ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
-  endif(EIGEN_TEST_SYCL)
-  # It should be safe to always run these tests as there is some fallback code for
-  # older compiler that don't support cxx11.
-  # This is already set if EIGEN_TEST_CXX11 is enabled:
-  # set(CMAKE_CXX_STANDARD 11)
+    set(EIGEN_SYCL ON)
+    # Forward CMake options as preprocessor definitions
+    if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
+      add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
+    endif()
+    if(EIGEN_SYCL_NO_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
+      add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM1)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1})
+    endif()
+    if(EIGEN_SYCL_REG_M)
+      add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M})
+    endif()
+    if(EIGEN_SYCL_REG_N)
+      add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
+    endif()
+    if(EIGEN_SYCL_USE_PROGRAM_CLASS)
+      add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
+    endif()
+    if(EIGEN_SYCL_ASYNC_EXECUTION)
+      add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SKINNY)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY})
+    endif()
+    if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER)
+    add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER})
+  endif()
+    if(EIGEN_SYCL_DISABLE_RANK1)
+      add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SCALAR)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR})
+    endif()
+    if(EIGEN_SYCL_DISABLE_GEMV)
+      add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV})
+    endif()
+    if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION)
+      add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
+    endif()
+
+    if(EIGEN_SYCL_TRISYCL)
+      # triSYCL now requires c++17.
+      set(CMAKE_CXX_STANDARD 17)
+    else()
+      if(MSVC)
+        # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+        # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+        set(CMAKE_CXX_STANDARD 14)
+        list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+      else()
+        set(CMAKE_CXX_STANDARD 11)
+        list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+      endif()
+      # The following flags are not supported by Clang and can cause warnings
+      # if used with -Werror so they are removed here.
+      if(COMPUTECPP_USE_COMPILER_DRIVER)
+        set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+        string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+      endif()
+      list(APPEND COMPUTECPP_USER_FLAGS
+          -DEIGEN_NO_ASSERTION_CHECKING=1
+          -no-serial-memop
+          -Xclang
+          -cl-mad-enable)
+    endif()
+
+    ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG})
+    set(EIGEN_SYCL OFF)
+  endif()
 
   ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 
   ei_add_test(cxx11_meta)
-  ei_add_test(cxx11_tensor_simple)
-#  ei_add_test(cxx11_tensor_symmetry)
-  ei_add_test(cxx11_tensor_index_list)
-  ei_add_test(cxx11_tensor_mixed_indices)
+  ei_add_test(cxx11_maxsizevector)
+  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_assign)
+  ei_add_test(cxx11_tensor_block_access)
+  ei_add_test(cxx11_tensor_block_eval)
+  ei_add_test(cxx11_tensor_block_io)
+  ei_add_test(cxx11_tensor_broadcasting)
+  ei_add_test(cxx11_tensor_casts)
+  ei_add_test(cxx11_tensor_chipping)
+  ei_add_test(cxx11_tensor_comparisons)
+  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_const)
   ei_add_test(cxx11_tensor_contraction)
   ei_add_test(cxx11_tensor_convolution)
+  ei_add_test(cxx11_tensor_custom_index)
+  ei_add_test(cxx11_tensor_custom_op)
+  ei_add_test(cxx11_tensor_dimension)
+  ei_add_test(cxx11_tensor_empty)
+  ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_expr)
+  ei_add_test(cxx11_tensor_fft)
   ei_add_test(cxx11_tensor_fixed_size)
-  ei_add_test(cxx11_tensor_of_const_values)
-  ei_add_test(cxx11_tensor_of_complex)
-  ei_add_test(cxx11_tensor_of_strings)
-  ei_add_test(cxx11_tensor_lvalue)
-  ei_add_test(cxx11_tensor_broadcasting)
-  ei_add_test(cxx11_tensor_chipping)
-  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_forced_eval)
+  ei_add_test(cxx11_tensor_generator)
+  ei_add_test(cxx11_tensor_ifft)
+  ei_add_test(cxx11_tensor_image_patch)
+  ei_add_test(cxx11_tensor_index_list)
   ei_add_test(cxx11_tensor_inflation)
+  ei_add_test(cxx11_tensor_intdiv)
+  ei_add_test(cxx11_tensor_io)
+  ei_add_test(cxx11_tensor_layout_swap)
+  ei_add_test(cxx11_tensor_lvalue)
+  ei_add_test(cxx11_tensor_map)
+  ei_add_test(cxx11_tensor_math)
+  ei_add_test(cxx11_tensor_mixed_indices)
   ei_add_test(cxx11_tensor_morphing)
+  ei_add_test(cxx11_tensor_move)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_of_complex)
+  ei_add_test(cxx11_tensor_of_const_values)
+  ei_add_test(cxx11_tensor_of_strings)
   ei_add_test(cxx11_tensor_padding)
   ei_add_test(cxx11_tensor_patch)
-  ei_add_test(cxx11_tensor_image_patch)
-  ei_add_test(cxx11_tensor_volume_patch)
+  ei_add_test(cxx11_tensor_random)
   ei_add_test(cxx11_tensor_reduction)
-  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_ref)
+  ei_add_test(cxx11_tensor_roundings)
+  ei_add_test(cxx11_tensor_scan)
   ei_add_test(cxx11_tensor_shuffling)
+  ei_add_test(cxx11_tensor_simple)
   ei_add_test(cxx11_tensor_striding)
-  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_sugar)
+  ei_add_test(cxx11_tensor_thread_local "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
-  ei_add_test(cxx11_tensor_ref)
-  ei_add_test(cxx11_tensor_random)
-  ei_add_test(cxx11_tensor_generator)
-  ei_add_test(cxx11_tensor_custom_op)
-  ei_add_test(cxx11_tensor_custom_index)
-  ei_add_test(cxx11_tensor_fft)
-  ei_add_test(cxx11_tensor_ifft)
-  ei_add_test(cxx11_tensor_scan)
+  ei_add_test(cxx11_tensor_trace)
+  ei_add_test(cxx11_tensor_volume_patch)
+#  ei_add_test(cxx11_tensor_symmetry)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  # This test requires __uint128_t which is only available on 64bit systems
+  ei_add_test(cxx11_tensor_uint128)
+endif()
 
 endif()
 
@@ -219,7 +308,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
-   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+    foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+        string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}")
+    endforeach()
   endif()
 
   set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
@@ -227,37 +320,98 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
   endif()
 
-  if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
-    set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
-  else()
-    # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
-    set(EIGEN_CUDA_CXX11_FLAG "")
-  endif()
-
-  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
+  set(NVCC_ARCH_FLAGS)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+  endforeach()
+  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
-  ei_add_test(cxx11_tensor_complex_cuda)
-  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
-  ei_add_test(cxx11_tensor_reduction_cuda)
-  ei_add_test(cxx11_tensor_argmax_cuda)
-  ei_add_test(cxx11_tensor_cast_float16_cuda)
-  ei_add_test(cxx11_tensor_scan_cuda)
+  ei_add_test(cxx11_tensor_complex_gpu)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+  ei_add_test(cxx11_tensor_reduction_gpu)
+  ei_add_test(cxx11_tensor_argmax_gpu)
+  ei_add_test(cxx11_tensor_cast_float16_gpu)
+  ei_add_test(cxx11_tensor_scan_gpu)
+
+  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
+      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
+    endif()
+  endforeach()
 
   # Contractions require arch 3.0 or higher
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
     ei_add_test(cxx11_tensor_device)
-    ei_add_test(cxx11_tensor_cuda)
-    ei_add_test(cxx11_tensor_contract_cuda)
-    ei_add_test(cxx11_tensor_of_float16_cuda)
+    ei_add_test(cxx11_tensor_gpu)
+    ei_add_test(cxx11_tensor_contract_gpu)
+    ei_add_test(cxx11_tensor_of_float16_gpu)
   endif()
 
   # The random number generation code requires arch 3.5 or greater.
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
-    ei_add_test(cxx11_tensor_random_cuda)
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
+    ei_add_test(cxx11_tensor_random_gpu)
   endif()
 
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+  set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+  if (EXISTS ${HIP_PATH})
+
+    list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+    find_package(HIP REQUIRED)
+    if (HIP_FOUND)
+
+      execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+      if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+	include_directories(${CMAKE_CURRENT_BINARY_DIR})
+	include_directories(${HIP_PATH}/include)
+
+	set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
+	#
+	# complex datatype is not yet supported by HIP
+	# so leaving out those tests for now
+	#
+	# ei_add_test(cxx11_tensor_complex_gpu)
+	# ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+	#
+	ei_add_test(cxx11_tensor_reduction_gpu)
+	ei_add_test(cxx11_tensor_argmax_gpu)
+	ei_add_test(cxx11_tensor_cast_float16_gpu)
+	ei_add_test(cxx11_tensor_scan_gpu)
+	ei_add_test(cxx11_tensor_device)
+
+	ei_add_test(cxx11_tensor_gpu)
+	ei_add_test(cxx11_tensor_contract_gpu)
+	ei_add_test(cxx11_tensor_of_float16_gpu)
+	ei_add_test(cxx11_tensor_random_gpu)
+
+	unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+      elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+	message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+      else ()
+	message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+      endif()
+
+    endif()
+
+  else ()
+
+    message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+  endif()
+
+endif()
+
diff --git a/contrib/eigen/unsupported/test/EulerAngles.cpp b/contrib/eigen/unsupported/test/EulerAngles.cpp
index a8cb5286409bf9186f79b66671a6d43b78836f5c..0955795b67c677d02ac9f6ff2f343f165988332e 100644
--- a/contrib/eigen/unsupported/test/EulerAngles.cpp
+++ b/contrib/eigen/unsupported/test/EulerAngles.cpp
@@ -13,146 +13,220 @@
 
 using namespace Eigen;
 
-template<typename EulerSystem, typename Scalar>
-void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
-  bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+// Unfortunately, we need to specialize it in order to work. (We could add it in main.h test framework)
+template <typename Scalar, class System>
+bool verifyIsApprox(const Eigen::EulerAngles<Scalar, System>& a, const Eigen::EulerAngles<Scalar, System>& b)
+{
+  return verifyIsApprox(a.angles(), b.angles());
+}
+
+// Verify that x is in the approxed range [a, b]
+#define VERIFY_APPROXED_RANGE(a, x, b) \
+  do { \
+  VERIFY_IS_APPROX_OR_LESS_THAN(a, x); \
+  VERIFY_IS_APPROX_OR_LESS_THAN(x, b); \
+  } while(0)
+
+const char X = EULER_X;
+const char Y = EULER_Y;
+const char Z = EULER_Z;
+
+template<typename Scalar, class EulerSystem>
+void verify_euler(const EulerAngles<Scalar, EulerSystem>& e)
 {
   typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
   typedef Matrix<Scalar,3,3> Matrix3;
   typedef Matrix<Scalar,3,1> Vector3;
   typedef Quaternion<Scalar> QuaternionType;
   typedef AngleAxis<Scalar> AngleAxisType;
-  using std::abs;
-  
-  Scalar alphaRangeStart, alphaRangeEnd;
-  Scalar betaRangeStart, betaRangeEnd;
-  Scalar gammaRangeStart, gammaRangeEnd;
   
-  if (positiveRangeAlpha)
-  {
-    alphaRangeStart = Scalar(0);
-    alphaRangeEnd = Scalar(2 * EIGEN_PI);
-  }
-  else
-  {
-    alphaRangeStart = -Scalar(EIGEN_PI);
-    alphaRangeEnd = Scalar(EIGEN_PI);
-  }
+  const Scalar ONE = Scalar(1);
+  const Scalar HALF_PI = Scalar(EIGEN_PI / 2);
+  const Scalar PI = Scalar(EIGEN_PI);
   
-  if (positiveRangeBeta)
-  {
-    betaRangeStart = Scalar(0);
-    betaRangeEnd = Scalar(2 * EIGEN_PI);
-  }
-  else
-  {
-    betaRangeStart = -Scalar(EIGEN_PI);
-    betaRangeEnd = Scalar(EIGEN_PI);
-  }
+  // It's very important calc the acceptable precision depending on the distance from the pole.
+  const Scalar longitudeRadius = std::abs(
+    EulerSystem::IsTaitBryan ?
+    std::cos(e.beta()) :
+    std::sin(e.beta())
+    );
+  Scalar precision = test_precision<Scalar>() / longitudeRadius;
   
-  if (positiveRangeGamma)
+  Scalar betaRangeStart, betaRangeEnd;
+  if (EulerSystem::IsTaitBryan)
   {
-    gammaRangeStart = Scalar(0);
-    gammaRangeEnd = Scalar(2 * EIGEN_PI);
+    betaRangeStart = -HALF_PI;
+    betaRangeEnd = HALF_PI;
   }
   else
   {
-    gammaRangeStart = -Scalar(EIGEN_PI);
-    gammaRangeEnd = Scalar(EIGEN_PI);
+    if (!EulerSystem::IsBetaOpposite)
+    {
+      betaRangeStart = 0;
+      betaRangeEnd = PI;
+    }
+    else
+    {
+      betaRangeStart = -PI;
+      betaRangeEnd = 0;
+    }
   }
   
-  const int i = EulerSystem::AlphaAxisAbs - 1;
-  const int j = EulerSystem::BetaAxisAbs - 1;
-  const int k = EulerSystem::GammaAxisAbs - 1;
+  const Vector3 I_ = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J_ = EulerAnglesType::BetaAxisVector();
+  const Vector3 K_ = EulerAnglesType::GammaAxisVector();
   
-  const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
-  const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
-  const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
-  
-  const Vector3 I = EulerAnglesType::AlphaAxisVector();
-  const Vector3 J = EulerAnglesType::BetaAxisVector();
-  const Vector3 K = EulerAnglesType::GammaAxisVector();
-  
-  EulerAnglesType e(ea[0], ea[1], ea[2]);
+  // Is approx checks
+  VERIFY(e.isApprox(e));
+  VERIFY_IS_APPROX(e, e);
+  VERIFY_IS_NOT_APPROX(e, EulerAnglesType(e.alpha() + ONE, e.beta() + ONE, e.gamma() + ONE));
+
+  const Matrix3 m(e);
+  VERIFY_IS_APPROX(Scalar(m.determinant()), ONE);
+
+  EulerAnglesType ebis(m);
   
-  Matrix3 m(e);
-  Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  // When no roll(acting like polar representation), we have the best precision.
+  // One of those cases is when the Euler angles are on the pole, and because it's singular case,
+  //  the computation returns no roll.
+  if (ebis.beta() == 0)
+    precision = test_precision<Scalar>();
   
   // Check that eabis in range
-  VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
-  VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
-  VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.alpha(), PI);
+  VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI);
+
+  const Matrix3 mbis(AngleAxisType(ebis.alpha(), I_) * AngleAxisType(ebis.beta(), J_) * AngleAxisType(ebis.gamma(), K_));
+  VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE);
+  VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix());
+  /*std::cout << "===================\n" <<
+    "e: " << e << std::endl <<
+    "eabis: " << eabis.transpose() << std::endl <<
+    "m: " << m << std::endl <<
+    "mbis: " << mbis << std::endl <<
+    "X: " << (m * Vector3::UnitX()).transpose() << std::endl <<
+    "X: " << (mbis * Vector3::UnitX()).transpose() << std::endl;*/
+  VERIFY(m.isApprox(mbis, precision));
+
+  // Test if ea and eabis are the same
+  // Need to check both singular and non-singular cases
+  // There are two singular cases.
+  // 1. When I==K and sin(ea(1)) == 0
+  // 2. When I!=K and cos(ea(1)) == 0
+
+  // TODO: Make this test work well, and use range saturation function.
+  /*// If I==K, and ea[1]==0, then there no unique solution.
+  // The remark apply in the case where I!=K, and |ea[1]| is close to +-pi/2.
+  if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
+      VERIFY_IS_APPROX(ea, eabis);*/
   
-  Vector3 eabis2 = m.eulerAngles(i, j, k);
+  // Quaternions
+  const QuaternionType q(e);
+  ebis = q;
+  const QuaternionType qbis(ebis);
+  VERIFY(internal::isApprox<Scalar>(std::abs(q.dot(qbis)), ONE, precision));
+  //VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
   
-  // Invert the relevant axes
-  eabis2[0] *= iFactor;
-  eabis2[1] *= jFactor;
-  eabis2[2] *= kFactor;
+  // A suggestion for simple product test when will be supported.
+  /*EulerAnglesType e2(PI/2, PI/2, PI/2);
+  Matrix3 m2(e2);
+  VERIFY_IS_APPROX(e*e2, m*m2);*/
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_vec(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler(EulerAngles<Scalar, EulerSystem<A, B, C> >(ea[0], ea[1], ea[2]));
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_all_neg(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_vec<+A,+B,+C>(ea);
+  verify_euler_vec<+A,+B,-C>(ea);
+  verify_euler_vec<+A,-B,+C>(ea);
+  verify_euler_vec<+A,-B,-C>(ea);
   
-  // Saturate the angles to the correct range
-  if (positiveRangeAlpha && (eabis2[0] < 0))
-    eabis2[0] += Scalar(2 * EIGEN_PI);
-  if (positiveRangeBeta && (eabis2[1] < 0))
-    eabis2[1] += Scalar(2 * EIGEN_PI);
-  if (positiveRangeGamma && (eabis2[2] < 0))
-    eabis2[2] += Scalar(2 * EIGEN_PI);
+  verify_euler_vec<-A,+B,+C>(ea);
+  verify_euler_vec<-A,+B,-C>(ea);
+  verify_euler_vec<-A,-B,+C>(ea);
+  verify_euler_vec<-A,-B,-C>(ea);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_all_neg<X,Y,Z>(ea);
+  verify_euler_all_neg<X,Y,X>(ea);
+  verify_euler_all_neg<X,Z,Y>(ea);
+  verify_euler_all_neg<X,Z,X>(ea);
   
-  VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+  verify_euler_all_neg<Y,Z,X>(ea);
+  verify_euler_all_neg<Y,Z,Y>(ea);
+  verify_euler_all_neg<Y,X,Z>(ea);
+  verify_euler_all_neg<Y,X,Y>(ea);
   
-  Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
-  VERIFY_IS_APPROX(m,  mbis);
+  verify_euler_all_neg<Z,X,Y>(ea);
+  verify_euler_all_neg<Z,X,Z>(ea);
+  verify_euler_all_neg<Z,Y,X>(ea);
+  verify_euler_all_neg<Z,Y,Z>(ea);
+}
+
+template<typename Scalar> void check_singular_cases(const Scalar& singularBeta)
+{
+  typedef Matrix<Scalar,3,1> Vector3;
+  const Scalar PI = Scalar(EIGEN_PI);
   
-  // Tests that are only relevant for no possitive range
-  if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+  for (Scalar epsilon = NumTraits<Scalar>::epsilon(); epsilon < 1; epsilon *= Scalar(1.2))
   {
-    /* If I==K, and ea[1]==0, then there no unique solution. */ 
-    /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */ 
-    if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
-      VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
-    
-    // approx_or_less_than does not work for 0
-    VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+    check_all_var(Vector3(PI/4, singularBeta, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - Scalar(1.5)*epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - 2*epsilon, PI/3));
+    check_all_var(Vector3(PI*Scalar(0.8), singularBeta - epsilon, Scalar(0.9)*PI));
+    check_all_var(Vector3(PI*Scalar(-0.9), singularBeta + epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.6), singularBeta + Scalar(1.5)*epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.5), singularBeta + 2*epsilon, PI*Scalar(0.4)));
+    check_all_var(Vector3(PI*Scalar(0.9), singularBeta + epsilon, Scalar(0.8)*PI));
   }
   
-  // Quaternions
-  QuaternionType q(e);
-  eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
-  VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
-}
-
-template<typename EulerSystem, typename Scalar>
-void verify_euler(const Matrix<Scalar,3,1>& ea)
-{
-  verify_euler_ranged<EulerSystem>(ea, false, false, false);
-  verify_euler_ranged<EulerSystem>(ea, false, false, true);
-  verify_euler_ranged<EulerSystem>(ea, false, true, false);
-  verify_euler_ranged<EulerSystem>(ea, false, true, true);
-  verify_euler_ranged<EulerSystem>(ea, true, false, false);
-  verify_euler_ranged<EulerSystem>(ea, true, false, true);
-  verify_euler_ranged<EulerSystem>(ea, true, true, false);
-  verify_euler_ranged<EulerSystem>(ea, true, true, true);
+  // This one for sanity, it had a problem with near pole cases in float scalar.
+  check_all_var(Vector3(PI*Scalar(0.8), singularBeta - Scalar(1E-6), Scalar(0.9)*PI));
 }
 
-template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+template<typename Scalar> void eulerangles_manual()
 {
-  verify_euler<EulerSystemXYZ>(ea);
-  verify_euler<EulerSystemXYX>(ea);
-  verify_euler<EulerSystemXZY>(ea);
-  verify_euler<EulerSystemXZX>(ea);
-  
-  verify_euler<EulerSystemYZX>(ea);
-  verify_euler<EulerSystemYZY>(ea);
-  verify_euler<EulerSystemYXZ>(ea);
-  verify_euler<EulerSystemYXY>(ea);
-  
-  verify_euler<EulerSystemZXY>(ea);
-  verify_euler<EulerSystemZXZ>(ea);
-  verify_euler<EulerSystemZYX>(ea);
-  verify_euler<EulerSystemZYZ>(ea);
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar,Dynamic,1> VectorX;
+  const Vector3 Zero = Vector3::Zero();
+  const Scalar PI = Scalar(EIGEN_PI);
+  
+  check_all_var(Zero);
+  
+  // singular cases
+  check_singular_cases(PI/2);
+  check_singular_cases(-PI/2);
+  
+  check_singular_cases(Scalar(0));
+  check_singular_cases(Scalar(-0));
+  
+  check_singular_cases(PI);
+  check_singular_cases(-PI);
+  
+  // non-singular cases
+  VectorX alpha = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  VectorX beta =  VectorX::LinSpaced(20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
+  VectorX gamma = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  for (int i = 0; i < alpha.size(); ++i) {
+    for (int j = 0; j < beta.size(); ++j) {
+      for (int k = 0; k < gamma.size(); ++k) {
+        check_all_var(Vector3(alpha(i), beta(j), gamma(k)));
+      }
+    }
+  }
 }
 
-template<typename Scalar> void eulerangles()
+template<typename Scalar> void eulerangles_rand()
 {
   typedef Matrix<Scalar,3,3> Matrix3;
   typedef Matrix<Scalar,3,1> Vector3;
@@ -199,10 +273,24 @@ template<typename Scalar> void eulerangles()
   check_all_var(ea);
 }
 
-void test_EulerAngles()
+EIGEN_DECLARE_TEST(EulerAngles)
 {
+  // Simple cast test
+  EulerAnglesXYZd onesEd(1, 1, 1);
+  EulerAnglesXYZf onesEf = onesEd.cast<float>();
+  VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());
+
+  // Simple Construction from Vector3 test
+  VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones()));
+  
+  CALL_SUBTEST_1( eulerangles_manual<float>() );
+  CALL_SUBTEST_2( eulerangles_manual<double>() );
+  
   for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( eulerangles<float>() );
-    CALL_SUBTEST_2( eulerangles<double>() );
+    CALL_SUBTEST_3( eulerangles_rand<float>() );
+    CALL_SUBTEST_4( eulerangles_rand<double>() );
   }
+  
+  // TODO: Add tests for auto diff
+  // TODO: Add tests for complex numbers
 }
diff --git a/contrib/eigen/unsupported/test/FFTW.cpp b/contrib/eigen/unsupported/test/FFTW.cpp
index 8b7528fb7e59a02562fce8b91b39479594da8d76..cfe559ebda567237b2789df1b675afb2a124a9c9 100644
--- a/contrib/eigen/unsupported/test/FFTW.cpp
+++ b/contrib/eigen/unsupported/test/FFTW.cpp
@@ -225,7 +225,7 @@ void test_return_by_value(int len)
     VERIFY( (in1-in).norm() < test_precision<float>() );
 }
 
-void test_FFTW()
+EIGEN_DECLARE_TEST(FFTW)
 {
   CALL_SUBTEST( test_return_by_value(32) );
   //CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
diff --git a/contrib/eigen/unsupported/test/NonLinearOptimization.cpp b/contrib/eigen/unsupported/test/NonLinearOptimization.cpp
index dd93c21e9fed3849a423ce44b961fefaa00d1c14..c667b7247ddcd72d8bbac7150f4c024f2748cfcb 100644
--- a/contrib/eigen/unsupported/test/NonLinearOptimization.cpp
+++ b/contrib/eigen/unsupported/test/NonLinearOptimization.cpp
@@ -1789,7 +1789,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_NonLinearOptimization()
+EIGEN_DECLARE_TEST(NonLinearOptimization)
 {
     // Tests using the examples provided by (c)minpack
     CALL_SUBTEST/*_1*/(testChkder());
diff --git a/contrib/eigen/unsupported/test/NumericalDiff.cpp b/contrib/eigen/unsupported/test/NumericalDiff.cpp
index 27d8880566202656c58febd44cac649d16a95a25..6d836413bbf32b94c392b7b219d731117067f65d 100644
--- a/contrib/eigen/unsupported/test/NumericalDiff.cpp
+++ b/contrib/eigen/unsupported/test/NumericalDiff.cpp
@@ -24,7 +24,7 @@ struct Functor
   int m_inputs, m_values;
   
   Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  Functor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  Functor(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
   
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -107,7 +107,7 @@ void test_central()
     VERIFY_IS_APPROX(jac, actual_jac);
 }
 
-void test_NumericalDiff()
+EIGEN_DECLARE_TEST(NumericalDiff)
 {
     CALL_SUBTEST(test_forward());
     CALL_SUBTEST(test_central());
diff --git a/contrib/eigen/unsupported/test/alignedvector3.cpp b/contrib/eigen/unsupported/test/alignedvector3.cpp
index 252cb1d3f6f7ac629d4106eea312b694effb1e06..f442e416a9f833d6e5b45ab78af3e1a28b47f11e 100644
--- a/contrib/eigen/unsupported/test/alignedvector3.cpp
+++ b/contrib/eigen/unsupported/test/alignedvector3.cpp
@@ -70,13 +70,16 @@ void alignedvector3()
     VERIFY_IS_APPROX(f6,r1-r4);
   }
   
+  FastType f8, f9(0,0,0);
+  VERIFY_IS_APPROX(f9-f1,-f1);
+
   std::stringstream ss1, ss2;
   ss1 << f1;
   ss2 << r1;
   VERIFY(ss1.str()==ss2.str());
 }
 
-void test_alignedvector3()
+EIGEN_DECLARE_TEST(alignedvector3)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST( alignedvector3<float>() );
diff --git a/contrib/eigen/unsupported/test/autodiff.cpp b/contrib/eigen/unsupported/test/autodiff.cpp
index 1d8c8b5fd718d7a74cf51682b99284dd7b373017..2cea56ba5f968ddbe6fb39c520b2b0840fd38f53 100644
--- a/contrib/eigen/unsupported/test/autodiff.cpp
+++ b/contrib/eigen/unsupported/test/autodiff.cpp
@@ -44,7 +44,7 @@ struct TestFunc1
   int m_inputs, m_values;
 
   TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -369,7 +369,7 @@ double bug_1281() {
 
 #endif
 
-void test_autodiff()
+EIGEN_DECLARE_TEST(autodiff)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( test_autodiff_scalar<1>() );
diff --git a/contrib/eigen/unsupported/test/autodiff_scalar.cpp b/contrib/eigen/unsupported/test/autodiff_scalar.cpp
index a917ec344f0fb7e9f7a39b4d6b43cd12fc4f90e8..e81a7788be75f9a17c151e3051d672fb7310bc8f 100644
--- a/contrib/eigen/unsupported/test/autodiff_scalar.cpp
+++ b/contrib/eigen/unsupported/test/autodiff_scalar.cpp
@@ -81,7 +81,7 @@ void check_limits_specialization()
   typedef std::numeric_limits<AD> A;
   typedef std::numeric_limits<Scalar> B;
 
-  // workaround "unsed typedef" warning:
+  // workaround "unused typedef" warning:
   VERIFY(!bool(internal::is_same<B, A>::value));
 
 #if EIGEN_HAS_CXX11
@@ -89,7 +89,7 @@ void check_limits_specialization()
 #endif
 }
 
-void test_autodiff_scalar()
+EIGEN_DECLARE_TEST(autodiff_scalar)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( check_atan2<float>() );
diff --git a/contrib/eigen/unsupported/test/bessel_functions.cpp b/contrib/eigen/unsupported/test/bessel_functions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..06765bfab81fedb048b742599b84d5575a6bfb67
--- /dev/null
+++ b/contrib/eigen/unsupported/test/bessel_functions.cpp
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)
+{
+  for(Index i=0; i<x.size(); ++i)
+  {
+    if((numext::isfinite)(y(i))) {
+      VERIFY_IS_APPROX( x(i), y(i) );
+    }
+    else if((numext::isnan)(y(i)))
+      VERIFY((numext::isnan)(x(i)));
+    else
+      VERIFY_IS_EQUAL( x(i), y(i) );
+  }
+}
+
+template<typename ArrayType> void array_bessel_functions() 
+{
+  // Test Bessel function i0. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << 4.35582826e+07, 6.21841242e+06, 8.93446228e+05, 1.29418563e+05,
+       1.89489253e+04, 2.81571663e+03, 4.27564116e+02, 6.72344070e+01,
+       1.13019220e+01, 2.27958530e+00, 1.00000000e+00, 2.27958530e+00,
+       1.13019220e+01, 6.72344070e+01, 4.27564116e+02, 2.81571663e+03,
+       1.89489253e+04, 1.29418563e+05, 8.93446228e+05, 6.21841242e+06,
+       4.35582826e+07;
+
+    CALL_SUBTEST(res = bessel_i0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << 0.0897803118848, 0.0947062952128, 0.100544127361,
+        0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+        0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+        0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+        0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+        0.0897803118848;
+
+    CALL_SUBTEST(res = bessel_i0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i1. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << -4.24549734e+07, -6.04313324e+06, -8.65059436e+05, -1.24707259e+05,
+       -1.81413488e+04, -2.67098830e+03, -3.99873137e+02, -6.13419368e+01,
+       -9.75946515e+00, -1.59063685e+00,  0.00000000e+00,  1.59063685e+00,
+        9.75946515e+00,  6.13419368e+01,  3.99873137e+02,  2.67098830e+03,
+        1.81413488e+04,  1.24707259e+05,  8.65059436e+05,  6.04313324e+06,
+        4.24549734e+07;
+
+    CALL_SUBTEST(res = bessel_i1(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i1e. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << -0.0875062221833, -0.092036796872, -0.0973496147565,
+        -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+        -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+        0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+        0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+        0.0875062221833;
+
+    CALL_SUBTEST(res = bessel_i1e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function j0. Reference results obtained with SciPy.
+  {
+    ArrayType x(77);
+    ArrayType expected(77);
+    ArrayType res(77);
+
+    x << -38., -37., -36., -35., -34., -33., -32., -31., -30.,
+      -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19.,
+      -18., -17., -16., -15., -14., -13., -12., -11., -10.,  -9.,  -8.,
+       -7.,  -6.,  -5.,  -4.,  -3.,  -2.,  -1.,   0.,   1.,   2.,   3.,
+        4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.,  13.,  14.,
+       15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,
+       26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,  36.,
+       37.,  38.;
+
+    expected << 0.11433274,  0.01086237, -0.10556738,
+             -0.12684568, -0.03042119,  0.09727067,  0.13807901,  0.05120815,
+             -0.08636798, -0.14784876, -0.07315701,  0.07274192,  0.15599932,
+              0.09626678, -0.05623027, -0.16241278, -0.12065148,  0.03657907,
+              0.16702466,  0.14662944, -0.01335581, -0.16985425, -0.17489907,
+             -0.01422447,  0.17107348,  0.2069261 ,  0.04768931, -0.1711903 ,
+             -0.24593576, -0.09033361,  0.17165081,  0.30007927,  0.15064526,
+             -0.17759677, -0.39714981, -0.26005195,  0.22389078,  0.76519769,
+              1.        ,  0.76519769,  0.22389078, -0.26005195, -0.39714981,
+             -0.17759677,  0.15064526,  0.30007927,  0.17165081, -0.09033361,
+             -0.24593576, -0.1711903 ,  0.04768931,  0.2069261 ,  0.17107348,
+             -0.01422447, -0.17489907, -0.16985425, -0.01335581,  0.14662944,
+              0.16702466,  0.03657907, -0.12065148, -0.16241278, -0.05623027,
+              0.09626678,  0.15599932,  0.07274192, -0.07315701, -0.14784876,
+             -0.08636798,  0.05120815,  0.13807901,  0.09727067, -0.03042119,
+             -0.12684568, -0.10556738,  0.01086237,  0.11433274;
+
+    CALL_SUBTEST(res = bessel_j0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function j1. Reference results obtained with SciPy.
+  {
+    ArrayType x(81);
+    ArrayType expected(81);
+    ArrayType res(81);
+
+    x << -40., -39., -38., -37., -36., -35., -34., -33., -32., -31., -30.,
+      -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19.,
+      -18., -17., -16., -15., -14., -13., -12., -11., -10.,  -9.,  -8.,
+       -7.,  -6.,  -5.,  -4.,  -3.,  -2.,  -1.,   0.,   1.,   2.,   3.,
+        4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.,  13.,  14.,
+       15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,
+       26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,  36.,
+       37.,  38.,  39.,  40.;
+
+    expected << -0.12603832, -0.0640561 ,  0.05916189,  0.13058004,  0.08232981,
+             -0.04399094, -0.13297118, -0.10061965,  0.02658903,  0.13302432,
+              0.11875106, -0.0069342 , -0.13055149, -0.13658472, -0.01504573,
+              0.12535025,  0.15403807,  0.03951932, -0.11717779, -0.17112027,
+             -0.06683312,  0.10570143,  0.18799489,  0.09766849, -0.09039718,
+             -0.20510404, -0.13337515,  0.07031805,  0.2234471 ,  0.1767853 ,
+             -0.04347275, -0.24531179, -0.23463635,  0.00468282,  0.27668386,
+              0.32757914,  0.06604333, -0.33905896, -0.57672481, -0.44005059,
+              0.        ,  0.44005059,  0.57672481,  0.33905896, -0.06604333,
+             -0.32757914, -0.27668386, -0.00468282,  0.23463635,  0.24531179,
+              0.04347275, -0.1767853 , -0.2234471 , -0.07031805,  0.13337515,
+              0.20510404,  0.09039718, -0.09766849, -0.18799489, -0.10570143,
+              0.06683312,  0.17112027,  0.11717779, -0.03951932, -0.15403807,
+             -0.12535025,  0.01504573,  0.13658472,  0.13055149,  0.0069342 ,
+             -0.11875106, -0.13302432, -0.02658903,  0.10061965,  0.13297118,
+              0.04399094, -0.08232981, -0.13058004, -0.05916189,  0.0640561 ,
+              0.12603832;
+
+    CALL_SUBTEST(res = bessel_j1(x);
+                 verify_component_wise(res, expected););
+  }
+  // Test Bessel function k0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822,
+             0.6977616 , 0.60929767, 0.54780756, 0.50186313, 0.4658451 ,
+             0.43662302, 0.41229555, 0.39163193, 0.3737955 , 0.35819488,
+             0.34439865, 0.33208364, 0.32100235, 0.31096159, 0.30180802,
+             0.29341821, 0.28569149, 0.27854488, 0.2719092 , 0.26572635,
+             0.25994703, 0.25452917, 0.2494366 , 0.24463801, 0.24010616,
+             0.23581722, 0.23175022, 0.22788667, 0.22421014, 0.22070602,
+             0.21736123, 0.21416406, 0.21110397, 0.20817141, 0.20535778,
+             0.20265524, 0.20005668, 0.19755558;
+
+    CALL_SUBTEST(res = bessel_k0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k0. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.54150675, 0.92441907, 4.21024438e-01, 1.13893873e-01,
+             3.47395044e-02, 1.11596761e-02, 3.69109833e-03, 1.24399433e-03,
+             4.24795742e-04, 1.46470705e-04, 5.08813130e-05, 1.77800623e-05,
+             6.24302055e-06, 2.20082540e-06, 7.78454386e-07, 2.76137082e-07,
+             9.81953648e-08, 3.49941166e-08, 1.24946640e-08, 4.46875334e-09,
+             1.60067129e-09, 5.74123782e-10, 2.06176797e-10, 7.41235161e-11,
+             2.66754511e-11, 9.60881878e-12, 3.46416156e-12, 1.24987740e-12,
+             4.51286453e-13, 1.63053459e-13, 5.89495073e-14, 2.13247750e-14,
+             7.71838266e-15, 2.79505752e-15, 1.01266123e-15, 3.67057597e-16,
+             1.33103515e-16, 4.82858338e-17, 1.75232770e-17, 6.36161716e-18,
+             2.31029936e-18, 8.39286110e-19;
+
+    CALL_SUBTEST(res = bessel_k0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822,
+             0.6977616 , 0.60929767, 0.54780756, 0.50186313,
+             0.4658451 , 0.43662302, 0.41229555, 0.39163193,
+             0.3737955 , 0.35819488, 0.34439865, 0.33208364,
+             0.32100235, 0.31096159, 0.30180802, 0.29341821,
+             0.28569149, 0.27854488, 0.2719092 , 0.26572635,
+             0.25994703, 0.25452917, 0.2494366 , 0.24463801,
+             0.24010616, 0.23581722, 0.23175022, 0.22788667,
+             0.22421014, 0.22070602, 0.21736123, 0.21416406,
+             0.21110397, 0.20817141, 0.20535778, 0.20265524,
+             0.20005668, 0.19755558;
+
+    CALL_SUBTEST(res = bessel_k0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k1. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 3.74702597, 1.65644112, 6.01907230e-01, 1.39865882e-01,
+             4.01564311e-02, 1.24834989e-02, 4.04461345e-03, 1.34391972e-03,
+             4.54182487e-04, 1.55369212e-04, 5.36370164e-05, 1.86487735e-05,
+             6.52086067e-06, 2.29075746e-06, 8.07858841e-07, 2.85834365e-07,
+             1.01417294e-07, 3.60715712e-08, 1.28570417e-08, 4.59124963e-09,
+             1.64226697e-09, 5.88305797e-10, 2.11029922e-10, 7.57898116e-11,
+             2.72493059e-11, 9.80699893e-12, 3.53277807e-12, 1.27369078e-12,
+             4.59568940e-13, 1.65940011e-13, 5.99574032e-14, 2.16773200e-14,
+             7.84189960e-15, 2.83839927e-15, 1.02789171e-15, 3.72416929e-16,
+             1.34991783e-16, 4.89519373e-17, 1.77585196e-17, 6.44478588e-18,
+             2.33973340e-18, 8.49713195e-19;
+
+    CALL_SUBTEST(res = bessel_k1(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k1e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 4.81127659, 2.73100971, 1.63615349, 1.03347685,
+             0.80656348, 0.68157595, 0.60027386, 0.54217591,
+             0.49807158, 0.46314909, 0.43462525, 0.41076657,
+             0.39043094, 0.37283175, 0.35740757, 0.34374563,
+             0.33153489, 0.32053597, 0.31056123, 0.30146131,
+             0.29311559, 0.2854255 , 0.27830958, 0.27169987,
+             0.26553913, 0.25977879, 0.25437733, 0.249299  ,
+             0.24451285, 0.23999191, 0.2357126 , 0.23165413,
+             0.22779816, 0.22412841, 0.22063036, 0.21729103,
+             0.21409878, 0.21104314, 0.20811462, 0.20530466,
+             0.20260547, 0.20000997;
+
+    CALL_SUBTEST(res = bessel_k1e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function y0. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << -0.93157302, -0.44451873, 0.08825696,  0.51037567,  0.37685001,
+             -0.01694074, -0.30851763, -0.28819468, -0.02594974,  0.22352149,
+             0.2499367 ,  0.05567117, -0.16884732, -0.22523731, -0.07820786,
+             0.12719257,  0.2054643 , 0.095811  , -0.0926372 , -0.18755216,
+             -0.10951969,  0.0626406 , 0.17020176,  0.1198876 , -0.03598179,
+             -0.15283403, -0.12724943, 0.01204463,  0.13521498,  0.13183647,
+             0.00948116, -0.11729573, -0.13383266, -0.02874248,  0.09913483,
+             0.13340405,  0.04579799, -0.08085609, -0.13071488, -0.06066076,
+             0.06262353,  0.12593642;
+
+    CALL_SUBTEST(res = bessel_y0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function y1. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << -2.70410523, -1.47147239, -0.78121282, -0.10703243,
+             0.32467442,  0.39792571,  0.14786314, -0.17501034, -0.30266724,
+             -0.15806046,  0.10431458,  0.24901542, 0.16370554, -0.05709922,
+             -0.21008141, -0.16664484,  0.02107363, 0.17797517,  0.16720504,
+             0.00815513, -0.14956011, -0.16551161, -0.03253926,  0.12340586,
+             0.1616692 ,  0.05305978, -0.09882996, -0.15579655, -0.07025124,
+             0.07552213,  0.14803412,  0.08442557, -0.05337283, -0.13854483,
+             -0.09578012,  0.03238588,  0.12751273, 0.10445477, -0.01262946,
+             -0.11514066, -0.11056411, -0.00579351;
+
+    CALL_SUBTEST(res = bessel_y1(x);
+                 verify_component_wise(res, expected););
+  }
+}
+
+EIGEN_DECLARE_TEST(bessel_functions)
+{
+  CALL_SUBTEST_1(array_bessel_functions<ArrayXf>());
+  CALL_SUBTEST_2(array_bessel_functions<ArrayXd>());
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_eventcount.cpp b/contrib/eigen/unsupported/test/cxx11_eventcount.cpp
index 3b598bf424badf076bc944f0a227e24627ac3a95..7bf4e965fc9cc2e1c338e412e46910e9f62dec29 100644
--- a/contrib/eigen/unsupported/test/cxx11_eventcount.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_eventcount.cpp
@@ -30,11 +30,11 @@ static void test_basic_eventcount()
   EventCount ec(waiters);
   EventCount::Waiter& w = waiters[0];
   ec.Notify(false);
-  ec.Prewait(&w);
+  ec.Prewait();
   ec.Notify(true);
   ec.CommitWait(&w);
-  ec.Prewait(&w);
-  ec.CancelWait(&w);
+  ec.Prewait();
+  ec.CancelWait();
 }
 
 // Fake bounded counter-based queue.
@@ -112,7 +112,7 @@ static void test_stress_eventcount()
         unsigned idx = rand_reentrant(&rnd) % kQueues;
         if (queues[idx].Pop()) continue;
         j--;
-        ec.Prewait(&w);
+        ec.Prewait();
         bool empty = true;
         for (int q = 0; q < kQueues; q++) {
           if (!queues[q].Empty()) {
@@ -121,7 +121,7 @@ static void test_stress_eventcount()
           }
         }
         if (!empty) {
-          ec.CancelWait(&w);
+          ec.CancelWait();
           continue;
         }
         ec.CommitWait(&w);
@@ -135,7 +135,7 @@ static void test_stress_eventcount()
   }
 }
 
-void test_cxx11_eventcount()
+EIGEN_DECLARE_TEST(cxx11_eventcount)
 {
   CALL_SUBTEST(test_basic_eventcount());
   CALL_SUBTEST(test_stress_eventcount());
diff --git a/contrib/eigen/unsupported/test/cxx11_maxsizevector.cpp b/contrib/eigen/unsupported/test/cxx11_maxsizevector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..46b689a8ed4a6914ea8bad586b4eeca551484bda
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_maxsizevector.cpp
@@ -0,0 +1,77 @@
+#include "main.h"
+
+#include <exception>  // std::exception
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+struct Foo
+{
+  static Index object_count;
+  static Index object_limit;
+  EIGEN_ALIGN_TO_BOUNDARY(128) int dummy;
+
+  Foo(int x=0) : dummy(x)
+  {
+#ifdef EIGEN_EXCEPTIONS
+    // TODO: Is this the correct way to handle this?
+    if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); }
+#endif
+    std::cout << '+';
+    ++Foo::object_count;
+    eigen_assert((internal::UIntPtr(this) & (127)) == 0);
+  }
+  Foo(const Foo&)
+  {
+    std::cout << 'c';
+    ++Foo::object_count;
+    eigen_assert((internal::UIntPtr(this) & (127)) == 0);
+  }
+
+  ~Foo()
+  {
+    std::cout << '~';
+    --Foo::object_count;
+  }
+
+  class Fail : public std::exception {};
+};
+
+Index Foo::object_count = 0;
+Index Foo::object_limit = 0;
+
+
+
+EIGEN_DECLARE_TEST(cxx11_maxsizevector)
+{
+  typedef MaxSizeVector<Foo> VectorX;
+  Foo::object_count = 0;
+  for(int r = 0; r < g_repeat; r++) {
+    Index rows = internal::random<Index>(3,30);
+    Foo::object_limit = internal::random<Index>(0, rows - 2);
+    std::cout << "object_limit = " << Foo::object_limit << std::endl;
+    bool exception_raised = false;
+#ifdef EIGEN_EXCEPTIONS
+    try
+    {
+#endif
+      std::cout <<       "\nVectorX m(" << rows << ");\n";
+      VectorX vect(rows);
+      for(int i=0; i<rows; ++i)
+          vect.push_back(Foo());
+#ifdef EIGEN_EXCEPTIONS
+      VERIFY(false);  // not reached if exceptions are enabled
+    }
+    catch (const Foo::Fail&) { exception_raised = true; }
+    VERIFY(exception_raised);
+#endif
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+
+    {
+      Foo::object_limit = rows+1;
+      VectorX vect2(rows, Foo());
+      VERIFY_IS_EQUAL(Foo::object_count, rows);
+    }
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+    std::cout << '\n';
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_meta.cpp b/contrib/eigen/unsupported/test/cxx11_meta.cpp
index 8911c59d808827b5ee2ed5aa3319e465d2fc4083..510e11032956811b3c06e268761ae4cf6950e19c 100644
--- a/contrib/eigen/unsupported/test/cxx11_meta.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_meta.cpp
@@ -340,7 +340,7 @@ static void test_array_misc()
   VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5);
 }
 
-void test_cxx11_meta()
+EIGEN_DECLARE_TEST(cxx11_meta)
 {
   CALL_SUBTEST(test_gen_numeric_list());
   CALL_SUBTEST(test_concat());
diff --git a/contrib/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/contrib/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 5f9bb938b0df4324679e58dda0bc0b44e6560499..993ee17894048a9c518f78cdafee8f3e1a9b9cb6 100644
--- a/contrib/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -11,22 +11,23 @@
 #define EIGEN_USE_THREADS
 #include "main.h"
 #include "Eigen/CXX11/ThreadPool"
+#include "Eigen/CXX11/Tensor"
 
 static void test_create_destroy_empty_pool()
 {
   // Just create and destroy the pool. This will wind up and tear down worker
   // threads. Ensure there are no issues in that logic.
   for (int i = 0; i < 16; ++i) {
-    NonBlockingThreadPool tp(i);
+    ThreadPool tp(i);
   }
 }
 
 
-static void test_parallelism()
+static void test_parallelism(bool allow_spinning)
 {
   // Test we never-ever fail to match available tasks with idle threads.
   const int kThreads = 16;  // code below expects that this is a multiple of 4
-  NonBlockingThreadPool tp(kThreads);
+  ThreadPool tp(kThreads, allow_spinning);
   VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
   VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
   for (int iter = 0; iter < 100; ++iter) {
@@ -100,8 +101,80 @@ static void test_parallelism()
   }
 }
 
-void test_cxx11_non_blocking_thread_pool()
+
+static void test_cancel()
+{
+  ThreadPool tp(2);
+
+  // Schedule a large number of closure that each sleeps for one second. This
+  // will keep the thread pool busy for much longer than the default test timeout.
+  for (int i = 0; i < 1000; ++i) {
+    tp.Schedule([]() {
+      std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+    });
+  }
+
+  // Cancel the processing of all the closures that are still pending.
+  tp.Cancel();
+}
+
+static void test_pool_partitions() {
+  const int kThreads = 2;
+  ThreadPool tp(kThreads);
+
+  // Assign each thread to its own partition, so that stealing other work only
+  // occurs globally when a thread is idle.
+  std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads);
+  for (int i = 0; i < kThreads; ++i) {
+    steal_partitions[i] = std::make_pair(i, i + 1);
+  }
+  tp.SetStealPartitions(steal_partitions);
+
+  std::atomic<int> running(0);
+  std::atomic<int> done(0);
+  std::atomic<int> phase(0);
+
+  // Schedule kThreads tasks and ensure that they all are running.
+  for (int i = 0; i < kThreads; ++i) {
+    tp.Schedule([&]() {
+      const int thread_id = tp.CurrentThreadId();
+      VERIFY_GE(thread_id, 0);
+      VERIFY_LE(thread_id, kThreads - 1);
+      ++running;
+      while (phase < 1) {
+      }
+      ++done;
+    });
+  }
+  while (running != kThreads) {
+  }
+  // Schedule each closure to only run on thread 'i' and verify that it does.
+  for (int i = 0; i < kThreads; ++i) {
+    tp.ScheduleWithHint(
+        [&, i]() {
+          ++running;
+          const int thread_id = tp.CurrentThreadId();
+          VERIFY_IS_EQUAL(thread_id, i);
+          while (phase < 2) {
+          }
+          ++done;
+        },
+        i, i + 1);
+  }
+  running = 0;
+  phase = 1;
+  while (running != kThreads) {
+  }
+  running = 0;
+  phase = 2;
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool)
 {
   CALL_SUBTEST(test_create_destroy_empty_pool());
-  CALL_SUBTEST(test_parallelism());
+  CALL_SUBTEST(test_parallelism(true));
+  CALL_SUBTEST(test_parallelism(false));
+  CALL_SUBTEST(test_cancel());
+  CALL_SUBTEST(test_pool_partitions());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_runqueue.cpp b/contrib/eigen/unsupported/test/cxx11_runqueue.cpp
index 91f690114b3dcf70b03bc83f1cf8fb59bec9355e..8fc5a30744bc7c599c75cb7ba6a0e5a03ade76b7 100644
--- a/contrib/eigen/unsupported/test/cxx11_runqueue.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_runqueue.cpp
@@ -227,7 +227,7 @@ void test_stress_runqueue()
   VERIFY(total.load() == 0);
 }
 
-void test_cxx11_runqueue()
+EIGEN_DECLARE_TEST(cxx11_runqueue)
 {
   CALL_SUBTEST_1(test_basic_runqueue());
   CALL_SUBTEST_2(test_empty_runqueue());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_argmax.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_argmax.cpp
index 037767270a4ed7c87643cd5e40b0aacb42fb9fa9..4a0c8967b8edb79a62e13bd1870212a7cb229d97 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_argmax.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_argmax.cpp
@@ -273,7 +273,7 @@ static void test_argmin_dim()
   }
 }
 
-void test_cxx11_tensor_argmax()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax)
 {
   CALL_SUBTEST(test_simple_index_tuples<RowMajor>());
   CALL_SUBTEST(test_simple_index_tuples<ColMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_argmax_gpu.cu
similarity index 72%
rename from contrib/eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 3d73d491adff6581e7f67085e03c64872aec113c..79f4066e9c354f0acb9b54416d1f782eda7cfd98 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -9,16 +9,18 @@
 
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 
 template <int Layout>
-void test_cuda_simple_argmax()
+void test_gpu_simple_argmax()
 {
   Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
   Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
@@ -34,13 +36,13 @@ void test_cuda_simple_argmax()
   double* d_in;
   DenseIndex* d_out_max;
   DenseIndex* d_out_min;
-  cudaMalloc((void**)(&d_in), in_bytes);
-  cudaMalloc((void**)(&d_out_max), out_bytes);
-  cudaMalloc((void**)(&d_out_min), out_bytes);
+  gpuMalloc((void**)(&d_in), in_bytes);
+  gpuMalloc((void**)(&d_out_max), out_bytes);
+  gpuMalloc((void**)(&d_out_min), out_bytes);
 
-  cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
@@ -50,20 +52,20 @@ void test_cuda_simple_argmax()
   gpu_out_max.device(gpu_device) = gpu_in.argmax();
   gpu_out_min.device(gpu_device) = gpu_in.argmin();
 
-  assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
   VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
 
-  cudaFree(d_in);
-  cudaFree(d_out_max);
-  cudaFree(d_out_min);
+  gpuFree(d_in);
+  gpuFree(d_out_max);
+  gpuFree(d_out_min);
 }
 
 template <int DataLayout>
-void test_cuda_argmax_dim()
+void test_gpu_argmax_dim()
 {
   Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   std::vector<int> dims;
@@ -97,12 +99,12 @@ void test_cuda_argmax_dim()
 
     float* d_in;
     DenseIndex* d_out;
-    cudaMalloc((void**)(&d_in), in_bytes);
-    cudaMalloc((void**)(&d_out), out_bytes);
+    gpuMalloc((void**)(&d_in), in_bytes);
+    gpuMalloc((void**)(&d_out), out_bytes);
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
-    Eigen::CudaStreamDevice stream;
+    Eigen::GpuStreamDevice stream;
     Eigen::GpuDevice gpu_device(&stream);
 
     Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -110,8 +112,8 @@ void test_cuda_argmax_dim()
 
     gpu_out.device(gpu_device) = gpu_in.argmax(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     VERIFY_IS_EQUAL(tensor_arg.size(),
                     size_t(2*3*5*7 / tensor.dimension(dim)));
@@ -134,25 +136,25 @@ void test_cuda_argmax_dim()
       }
     }
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
     gpu_out.device(gpu_device) = gpu_in.argmax(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
       // Expect max to be in the last index of the reduced dimension
       VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
     }
 
-    cudaFree(d_in);
-    cudaFree(d_out);
+    gpuFree(d_in);
+    gpuFree(d_out);
   }
 }
 
 template <int DataLayout>
-void test_cuda_argmin_dim()
+void test_gpu_argmin_dim()
 {
   Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   std::vector<int> dims;
@@ -186,12 +188,12 @@ void test_cuda_argmin_dim()
 
     float* d_in;
     DenseIndex* d_out;
-    cudaMalloc((void**)(&d_in), in_bytes);
-    cudaMalloc((void**)(&d_out), out_bytes);
+    gpuMalloc((void**)(&d_in), in_bytes);
+    gpuMalloc((void**)(&d_out), out_bytes);
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
-    Eigen::CudaStreamDevice stream;
+    Eigen::GpuStreamDevice stream;
     Eigen::GpuDevice gpu_device(&stream);
 
     Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -199,8 +201,8 @@ void test_cuda_argmin_dim()
 
     gpu_out.device(gpu_device) = gpu_in.argmin(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     VERIFY_IS_EQUAL(tensor_arg.size(),
                     2*3*5*7 / tensor.dimension(dim));
@@ -223,29 +225,29 @@ void test_cuda_argmin_dim()
       }
     }
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
     gpu_out.device(gpu_device) = gpu_in.argmin(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
       // Expect max to be in the last index of the reduced dimension
       VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
     }
 
-    cudaFree(d_in);
-    cudaFree(d_out);
+    gpuFree(d_in);
+    gpuFree(d_out);
   }
 }
 
-void test_cxx11_tensor_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu)
 {
-  CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
-  CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
-  CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
-  CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
+  CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
+  CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
+  CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
+  CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_argmax_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ac71286e1f8167cc808f257391735700cb697e1
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int Layout, typename DenseIndex>
+static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
+  Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2, 2, 2}});
+  Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
+  Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
+  in.setRandom();
+  in *= in.constant(100.0);
+  in(0, 0, 0) = -1000.0;
+  in(1, 1, 1) = 1000.0;
+
+  std::size_t in_bytes = in.size() * sizeof(DataType);
+  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+  DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+  DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+  DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in,
+                                                                           Eigen::array<DenseIndex, 3>{{2, 2, 2}});
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max);
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min);
+  sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes);
+
+  gpu_out_max.device(sycl_device) = gpu_in.argmax();
+  gpu_out_min.device(sycl_device) = gpu_in.argmin();
+
+  sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes);
+  sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes);
+
+  VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1);
+  VERIFY_IS_EQUAL(out_min(), 0);
+
+  sycl_device.deallocate(d_in);
+  sycl_device.deallocate(d_out_max);
+  sycl_device.deallocate(d_out_min);
+}
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
+  DenseIndex sizeDim0 = 9;
+  DenseIndex sizeDim1 = 3;
+  DenseIndex sizeDim2 = 5;
+  DenseIndex sizeDim3 = 7;
+  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
+
+  std::vector<DenseIndex> dims;
+  dims.push_back(sizeDim0);
+  dims.push_back(sizeDim1);
+  dims.push_back(sizeDim2);
+  dims.push_back(sizeDim3);
+  for (DenseIndex dim = 0; dim < 4; ++dim) {
+    array<DenseIndex, 3> out_shape;
+    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
+
+    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
+            // = 10.0
+            tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(DataType);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+    DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
+        d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+                    size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    sycl_device.synchronize();
+
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+          }
+        }
+      }
+    }
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+    sycl_device.deallocate(d_in);
+    sycl_device.deallocate(d_out);
+  }
+}
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
+  DenseIndex sizeDim0 = 9;
+  DenseIndex sizeDim1 = 3;
+  DenseIndex sizeDim2 = 5;
+  DenseIndex sizeDim3 = 7;
+  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
+
+  std::vector<DenseIndex> dims;
+  dims.push_back(sizeDim0);
+  dims.push_back(sizeDim1);
+  dims.push_back(sizeDim2);
+  dims.push_back(sizeDim3);
+  for (DenseIndex dim = 0; dim < 4; ++dim) {
+    array<DenseIndex, 3> out_shape;
+    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
+
+    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+            tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(DataType);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+    DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
+        d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+                    size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    sycl_device.synchronize();
+
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+          }
+        }
+      }
+    }
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+    sycl_device.deallocate(d_in);
+    sycl_device.deallocate(d_out);
+  }
+}
+
+template <typename DataType, typename Device_Selector>
+void sycl_argmax_test_per_device(const Device_Selector& d) {
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_assign.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_assign.cpp
index 8fe85d83ccb542c784cb0b1b9d8d6d57bfac20d8..ce9d24369100851eb935a87640ee5caa8f9d24f9 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_assign.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_assign.cpp
@@ -358,7 +358,7 @@ static void test_std_initializers_tensor() {
 #endif  // EIGEN_HAS_VARIADIC_TEMPLATES
 }
 
-void test_cxx11_tensor_assign()
+EIGEN_DECLARE_TEST(cxx11_tensor_assign)
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_block_access.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_block_access.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5fb12e0e0eb2f45dae1be044a4b40e10a1b2a19f
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_block_access.cpp
@@ -0,0 +1,576 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Andy Davis <andydavis@google.com>
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <algorithm>
+#include <set>
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::Index;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TensorBlockShapeType;
+
+static TensorOpCost zeroCost() { return {0, 0, 0}; }
+
+template<typename T>
+static const T& choose(int layout, const T& col, const T& row) {
+  return layout == ColMajor ? col : row;
+}
+
+static TensorBlockShapeType RandomShape() {
+  return internal::random<bool>()
+         ? TensorBlockShapeType::kUniformAllDims
+         : TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
+  return internal::random<size_t>(1, dims.TotalSize());
+}
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims() {
+  array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(1, 20);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+template <typename T>
+static T* GenerateRandomData(const Index& size) {
+  T* data = new T[size];
+  for (int i = 0; i < size; ++i) {
+    data[i] = internal::random<T>();
+  }
+  return data;
+}
+
+template <int NumDims>
+static void Debug(DSizes<Index, NumDims> dims) {
+  for (int i = 0; i < NumDims; ++i) {
+    std::cout << dims[i] << "; ";
+  }
+  std::cout << std::endl;
+}
+
+template <int Layout>
+static void test_block_mapper_sanity()
+{
+  typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;
+
+  DSizes<Index, 2> tensor_dims(100, 100);
+
+  // Test uniform blocks.
+  TensorBlockMapper uniform_block_mapper(
+      tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100, zeroCost()});
+
+  VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100);
+  VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100);
+
+  // 10x10 blocks
+  auto uniform_b0 = uniform_block_mapper.blockDescriptor(0);
+  VERIFY_IS_EQUAL(uniform_b0.dimensions().at(0), 10);
+  VERIFY_IS_EQUAL(uniform_b0.dimensions().at(1), 10);
+
+  // Test skewed to inner dims blocks.
+  TensorBlockMapper skewed_block_mapper(
+      tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100, zeroCost()});
+
+  VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100);
+  VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100);
+
+  // 1x100 (100x1) rows/cols depending on a tensor layout.
+  auto skewed_b0 = skewed_block_mapper.blockDescriptor(0);
+  VERIFY_IS_EQUAL(skewed_b0.dimensions().at(0), choose(Layout, 100, 1));
+  VERIFY_IS_EQUAL(skewed_b0.dimensions().at(1), choose(Layout, 1, 100));
+}
+
+// Given a TensorBlock "visit" every element accessible though it, and a keep an
+// index in the visited set. Verify that every coeff accessed only once.
+template<int NumDims, int Layout>
+static void UpdateCoeffSet(
+    const DSizes<Index, NumDims>& tensor_strides,
+    const internal::TensorBlockDescriptor<NumDims>& block,
+    Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
+  const DSizes<Index, NumDims>& block_sizes = block.dimensions();
+
+  for (int i = 0; i < block_sizes[dim_index]; ++i) {
+    if (tensor_strides[dim_index] == 1) {
+      typedef std::pair<std::set<Index>::iterator, bool> ReturnType;
+      ReturnType inserted = visited_coeffs->insert(first_coeff_index + i);
+      VERIFY_IS_EQUAL(inserted.second, true);
+    } else {
+      int next_dim_index = dim_index + choose(Layout, -1, 1);
+      UpdateCoeffSet<NumDims, Layout>(tensor_strides, block, first_coeff_index,
+                                         next_dim_index, visited_coeffs);
+      first_coeff_index += tensor_strides[dim_index];
+    }
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_mapper_maps_every_element() {
+  typedef internal::TensorBlockMapper<NumDims, Layout> TensorBlockMapper;
+
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>();
+  DSizes<Index, NumDims> strides = internal::strides<Layout>(dims);
+
+  // Keep track of elements indices available via block access.
+  std::set<Index> coeff_set;
+
+  // Try different combinations of block types and sizes.
+  TensorBlockMapper block_mapper(
+      dims, {RandomShape(), RandomTargetSize(dims), zeroCost()});
+
+  for (int i = 0; i < block_mapper.blockCount(); ++i) {
+    auto block = block_mapper.blockDescriptor(i);
+    UpdateCoeffSet<NumDims, Layout>(strides, block, block.offset(),
+                                    choose(Layout, NumDims - 1, 0),
+                                    &coeff_set);
+  }
+
+  // Verify that every coefficient in the original Tensor is accessible through
+  // TensorBlock only once.
+  Index total_coeffs = dims.TotalSize();
+  VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
+  VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
+  VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
+}
+
+template <int Layout, int NumDims>
+static Index GetInputIndex(Index output_index,
+                         const array<Index, NumDims>& output_to_input_dim_map,
+                         const array<Index, NumDims>& input_strides,
+                         const array<Index, NumDims>& output_strides) {
+  int input_index = 0;
+  if (Layout == ColMajor) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[0]];
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+  }
+}
+
+template <int Layout, int NumDims>
+static array<Index, NumDims> ComputeStrides(
+    const array<Index, NumDims>& sizes) {
+  array<Index, NumDims> strides;
+  if (Layout == ColMajor) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * sizes[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * sizes[i + 1];
+    }
+  }
+  return strides;
+}
+
+template<typename Scalar, typename StorageIndex, int Dim>
+class EqualityChecker
+{
+    const Scalar* input_data;
+    const DSizes<StorageIndex, Dim> &input_dims, &input_strides, &output_dims, &output_strides;
+    void check_recursive(const Scalar* input, const Scalar* output, int depth=0) const
+    {
+        if(depth==Dim)
+        {
+            VERIFY_IS_EQUAL(*input, *output);
+            return;
+        }
+
+        for(int i=0; i<output_dims[depth]; ++i)
+        {
+            check_recursive(input + i % input_dims[depth] * input_strides[depth], output + i*output_strides[depth], depth+1);
+        }
+    }
+public:
+    EqualityChecker(const Scalar* input_data_,
+            const DSizes<StorageIndex, Dim> &input_dims_, const DSizes<StorageIndex, Dim> &input_strides_,
+            const DSizes<StorageIndex, Dim> &output_dims_, const DSizes<StorageIndex, Dim> &output_strides_)
+        : input_data(input_data_)
+        , input_dims(input_dims_), input_strides(input_strides_)
+        , output_dims(output_dims_), output_strides(output_strides_)
+        {}
+
+    void operator()(const Scalar* output_data) const
+    {
+        check_recursive(input_data, output_data);
+    }
+};
+
+template <int Layout>
+static void test_uniform_block_shape()
+{
+  typedef internal::TensorBlockDescriptor<5> TensorBlock;
+  typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;
+
+  {
+    // Test shape 'UniformAllDims' with uniform 'max_coeff count'.
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    for (int i = 0; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // partially into first inner-most dimension.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 6;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(6, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // fully into first inner-most dimension.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // fully into first few inner-most dimensions.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 7 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 6 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with full allocation to all dims.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 9 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(9, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+}
+
+template <int Layout>
+static void test_skewed_inner_dim_block_shape()
+{
+  typedef internal::TensorBlockDescriptor<5> TensorBlock;
+  typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;
+
+  // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 10 * 1 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(10, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 1 * 6;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(6, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 1 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 1 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+  // and partial allocation to second inner-dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 3 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(3, block.dimensions()[1]);
+    for (int i = 2; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 15 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(15, block.dimensions()[3]);
+    for (int i = 2; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+  // and partial allocation to third inner-dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 5 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    for (int i = 3; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 5 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    for (int i = 1; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to all dims.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+}
+
+template <int Layout>
+static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
+{
+  // Test blocking of tensors with zero dimensions:
+  //  - we must not crash on asserts and divisions by zero
+  //  - we must not return block with zero dimensions
+  //    (recipe for overflows/underflows, divisions by zero and NaNs later)
+  //  - total block count must be zero
+  {
+    typedef internal::TensorBlockMapper<1, Layout> TensorBlockMapper;
+
+    DSizes<Index, 1> dims(0);
+    for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+      TensorBlockMapper block_mapper(
+          dims, {block_shape, max_coeff_count, zeroCost()});
+      VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
+      VERIFY(block_mapper.blockTotalSize() >= 1);
+    }
+  }
+
+  {
+    typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;
+
+    for (int dim1 = 0; dim1 < 3; ++dim1) {
+      for (int dim2 = 0; dim2 < 3; ++dim2) {
+        DSizes<Index, 2> dims(dim1, dim2);
+        for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+          TensorBlockMapper block_mapper(
+              dims, {block_shape, max_coeff_count, zeroCost()});
+          if (dim1 * dim2 == 0) {
+            VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
+          }
+          VERIFY(block_mapper.blockTotalSize() >= 1);
+        }
+      }
+    }
+  }
+}
+
+#define TEST_LAYOUTS(NAME) \
+  CALL_SUBTEST(NAME<ColMajor>()); \
+  CALL_SUBTEST(NAME<RowMajor>())
+
+#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME)    \
+  CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))
+
+#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
+  CALL_SUBTEST(NAME<ColMajor>(ARG)); \
+  CALL_SUBTEST(NAME<RowMajor>(ARG))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
+  TEST_LAYOUTS(test_block_mapper_sanity);
+  TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
+  TEST_LAYOUTS(test_uniform_block_shape);
+  TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
+}
+
+#undef TEST_LAYOUTS
+#undef TEST_LAYOUTS_WITH_ARG
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_block_eval.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_block_eval.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b2e26ebb7334e417bbd957b74a9ff9cd500f0ecd
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -0,0 +1,858 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+using Eigen::internal::TensorBlockDescriptor;
+using Eigen::internal::TensorExecutor;
+
+// -------------------------------------------------------------------------- //
+// Utility functions to generate random tensors, blocks, and evaluate them.
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
+  DSizes<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<Index>(min, max);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+// Block offsets and extents allows to construct a TensorSlicingOp corresponding
+// to a TensorBlockDescriptor.
+template <int NumDims>
+struct TensorBlockParams {
+  DSizes<Index, NumDims> offsets;
+  DSizes<Index, NumDims> sizes;
+  TensorBlockDescriptor<NumDims, Index> desc;
+};
+
+template <int Layout, int NumDims>
+static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims,
+                                              Index min, Index max) {
+  // Choose random offsets and sizes along all tensor dimensions.
+  DSizes<Index, NumDims> offsets(RandomDims<NumDims>(min, max));
+  DSizes<Index, NumDims> sizes(RandomDims<NumDims>(min, max));
+
+  // Make sure that offset + size do not overflow dims.
+  for (int i = 0; i < NumDims; ++i) {
+    offsets[i] = numext::mini(dims[i] - 1, offsets[i]);
+    sizes[i] = numext::mini(sizes[i], dims[i] - offsets[i]);
+  }
+
+  Index offset = 0;
+  DSizes<Index, NumDims> strides = Eigen::internal::strides<Layout>(dims);
+  for (int i = 0; i < NumDims; ++i) {
+    offset += strides[i] * offsets[i];
+  }
+
+  return {offsets, sizes, TensorBlockDescriptor<NumDims, Index>(offset, sizes)};
+}
+
+// Generate block with block sizes skewed towards inner dimensions. This type of
+// block is required for evaluating broadcast expressions.
+template <int Layout, int NumDims>
+static TensorBlockParams<NumDims> SkewedInnerBlock(
+    DSizes<Index, NumDims> dims) {
+  using BlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>;
+  BlockMapper block_mapper(dims,
+                           {internal::TensorBlockShapeType::kSkewedInnerDims,
+                            internal::random<size_t>(1, dims.TotalSize()),
+                            {0, 0, 0}});
+
+  Index total_blocks = block_mapper.blockCount();
+  Index block_index = internal::random<Index>(0, total_blocks - 1);
+  auto block = block_mapper.blockDescriptor(block_index);
+  DSizes<Index, NumDims> sizes = block.dimensions();
+
+  auto strides = internal::strides<Layout>(dims);
+  DSizes<Index, NumDims> offsets;
+
+  // Compute offsets for the first block coefficient.
+  Index index = block.offset();
+  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / strides[i];
+      index -= idx * strides[i];
+      offsets[i] = idx;
+    }
+    if (NumDims > 0) offsets[0] = index;
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / strides[i];
+      index -= idx * strides[i];
+      offsets[i] = idx;
+    }
+    if (NumDims > 0) offsets[NumDims - 1] = index;
+  }
+
+  return {offsets, sizes, block};
+}
+
+template <int NumDims>
+static TensorBlockParams<NumDims> FixedSizeBlock(DSizes<Index, NumDims> dims) {
+  DSizes<Index, NumDims> offsets;
+  for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+
+  return {offsets, dims, TensorBlockDescriptor<NumDims, Index>(0, dims)};
+}
+
+inline Eigen::IndexList<Index, Eigen::type2index<1>> NByOne(Index n) {
+  Eigen::IndexList<Index, Eigen::type2index<1>> ret;
+  ret.set(0, n);
+  return ret;
+}
+inline Eigen::IndexList<Eigen::type2index<1>, Index> OneByM(Index m) {
+  Eigen::IndexList<Eigen::type2index<1>, Index> ret;
+  ret.set(1, m);
+  return ret;
+}
+
+// -------------------------------------------------------------------------- //
+// Verify that block expression evaluation produces the same result as a
+// TensorSliceOp (reading a tensor block is same to taking a tensor slice).
+
+template <typename T, int NumDims, int Layout, typename Expression,
+          typename GenBlockParams>
+static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
+  using Device = DefaultDevice;
+  auto d = Device();
+
+  // Scratch memory allocator for block evaluation.
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  TensorBlockScratch scratch(d);
+
+  // TensorEvaluator is needed to produce tensor blocks of the expression.
+  auto eval = TensorEvaluator<const decltype(expr), Device>(expr, d);
+  eval.evalSubExprsIfNeeded(nullptr);
+
+  // Choose a random offsets, sizes and TensorBlockDescriptor.
+  TensorBlockParams<NumDims> block_params = gen_block();
+
+  // Evaluate TensorBlock expression into a tensor.
+  Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());
+
+  // Dimensions for the potential destination buffer.
+  DSizes<Index, NumDims> dst_dims;
+  if (internal::random<bool>()) {
+    dst_dims = block_params.desc.dimensions();
+  } else {
+    for (int i = 0; i < NumDims; ++i) {
+      Index extent = internal::random<Index>(0, 5);
+      dst_dims[i] = block_params.desc.dimension(i) + extent;
+    }
+  }
+
+  // Maybe use this tensor as a block desc destination.
+  Tensor<T, NumDims, Layout> dst(dst_dims);
+  dst.setZero();
+  if (internal::random<bool>()) {
+    block_params.desc.template AddDestinationBuffer<Layout>(
+        dst.data(), internal::strides<Layout>(dst.dimensions()));
+  }
+
+  const bool root_of_expr = internal::random<bool>();
+  auto tensor_block = eval.block(block_params.desc, scratch, root_of_expr);
+
+  if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) {
+    // Copy data from destination buffer.
+    if (dimensions_match(dst.dimensions(), block.dimensions())) {
+      block = dst;
+    } else {
+      DSizes<Index, NumDims> offsets;
+      for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+      block = dst.slice(offsets, block.dimensions());
+    }
+
+  } else {
+    // Assign to block from expression.
+    auto b_expr = tensor_block.expr();
+
+    // We explicitly disable vectorization and tiling, to run a simple coefficient
+    // wise assignment loop, because it's very simple and should be correct.
+    using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>;
+    using BlockExecutor = TensorExecutor<const BlockAssign, Device, false,
+                                         internal::TiledEvaluation::Off>;
+    BlockExecutor::run(BlockAssign(block, b_expr), d);
+  }
+
+  // Cleanup temporary buffers owned by a tensor block.
+  tensor_block.cleanup();
+
+  // Compute a Tensor slice corresponding to a Tensor block.
+  Tensor<T, NumDims, Layout> slice(block_params.desc.dimensions());
+  auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+  // Explicitly use coefficient assignment to evaluate slice expression.
+  using SliceAssign = TensorAssignOp<decltype(slice), const decltype(s_expr)>;
+  using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+                                       internal::TiledEvaluation::Off>;
+  SliceExecutor::run(SliceAssign(slice, s_expr), d);
+
+  // Tensor block and tensor slice must be the same.
+  for (Index i = 0; i < block.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i));
+  }
+}
+
+// -------------------------------------------------------------------------- //
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Identity tensor expression transformation.
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_unary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.abs(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_binary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      lhs * rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_binary_with_unary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      (lhs.square() + rhs.square()).sqrt(),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_broadcast() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> bcast = RandomDims<NumDims>(1, 5);
+
+  DSizes<Index, NumDims> bcasted_dims;
+  for (int i = 0; i < NumDims; ++i) bcasted_dims[i] = dims[i] * bcast[i];
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return RandomBlock<Layout>(bcasted_dims, 5, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return FixedSizeBlock(bcasted_dims); });
+
+  // Check that desc.destination() memory is not shared between two broadcast
+  // materializations.
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast) * input.abs().broadcast(bcast),
+      [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reshape() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+
+  DSizes<Index, NumDims> shuffled = dims;
+  std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed));
+
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reshape(shuffled),
+      [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reshape(shuffled),
+      [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_cast() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.template cast<int>().template cast<T>(),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_select() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims);
+  Tensor<T, NumDims, Layout> rhs(dims);
+  Tensor<bool, NumDims, Layout> cond(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+  cond.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(cond.select(lhs, rhs), [&dims]() {
+    return RandomBlock<Layout>(dims, 1, 20);
+  });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_padding() {
+  const int inner_dim = Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> pad_before = RandomDims<NumDims>(0, 4);
+  DSizes<Index, NumDims> pad_after = RandomDims<NumDims>(0, 4);
+  array<std::pair<Index, Index>, NumDims> paddings;
+  for (int i = 0; i < NumDims; ++i) {
+    paddings[i] = std::make_pair(pad_before[i], pad_after[i]);
+  }
+
+  // Test squeezing reads from inner dim.
+  if (internal::random<bool>()) {
+    pad_before[inner_dim] = 0;
+    pad_after[inner_dim] = 0;
+    paddings[inner_dim] = std::make_pair(0, 0);
+  }
+
+  DSizes<Index, NumDims> padded_dims;
+  for (int i = 0; i < NumDims; ++i) {
+    padded_dims[i] = dims[i] + pad_before[i] + pad_after[i];
+  }
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return FixedSizeBlock(padded_dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_chipping() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  Index chip_dim = internal::random<int>(0, NumDims - 1);
+  Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+  DSizes<Index, NumDims - 1> chipped_dims;
+  for (Index i = 0; i < chip_dim; ++i) {
+    chipped_dims[i] = dims[i];
+  }
+  for (Index i = chip_dim + 1; i < NumDims; ++i) {
+    chipped_dims[i - 1] = dims[i];
+  }
+
+  // Block buffer forwarding.
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+  // Block expression assignment.
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.abs().chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.abs().chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+}
+
+
+template<typename T, int NumDims>
+struct SimpleTensorGenerator {
+  T operator()(const array<Index, NumDims>& coords) const {
+    T result = static_cast<T>(0);
+    for (int i = 0; i < NumDims; ++i) {
+      result += static_cast<T>((i + 1) * coords[i]);
+    }
+    return result;
+  }
+};
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<int NumDims>
+struct SimpleTensorGenerator<bool, NumDims> {
+  bool operator()(const array<Index, NumDims>& coords) const {
+    bool result = false;
+    for (int i = 0; i < NumDims; ++i) {
+      result ^= coords[i];
+    }
+    return result;
+  }
+};
+
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_generator() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  auto generator = SimpleTensorGenerator<T, NumDims>();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.generate(generator), [&dims]() { return FixedSizeBlock(dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.generate(generator),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reverse() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Randomly reverse dimensions.
+  Eigen::DSizes<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(input.reverse(reverse), [&dims]() {
+    return RandomBlock<Layout>(dims, 1, 10);
+  });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_slice() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Pick a random slice of an input tensor.
+  DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+  DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockEvaluator<T, NumDims, Layout>(
+        input.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockEvaluator<T, NumDims, Layout>(
+        input.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+    break;
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_reshape_with_bcast() {
+  Index dim = internal::random<Index>(1, 100);
+
+  Tensor<T, 2, Layout> lhs(1, dim);
+  Tensor<T, 2, Layout> rhs(dim, 1);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  auto reshapeLhs = NByOne(dim);
+  auto reshapeRhs = OneByM(dim);
+
+  auto bcastLhs = OneByM(dim);
+  auto bcastRhs = NByOne(dim);
+
+  DSizes<Index, 2> dims(dim, dim);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      lhs.reshape(reshapeLhs).broadcast(bcastLhs) *
+          rhs.reshape(reshapeRhs).broadcast(bcastRhs),
+      [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_forced_eval() {
+  Index dim = internal::random<Index>(1, 100);
+
+  Tensor<T, 2, Layout> lhs(dim, 1);
+  Tensor<T, 2, Layout> rhs(1, dim);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  auto bcastLhs = OneByM(dim);
+  auto bcastRhs = NByOne(dim);
+
+  DSizes<Index, 2> dims(dim, dim);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
+      [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
+      [dims]() { return RandomBlock<Layout, 2>(dims, 1, 50); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_chipping_of_bcast() {
+  if (Layout != static_cast<int>(RowMajor)) return;
+
+  Index dim0 = internal::random<Index>(1, 10);
+  Index dim1 = internal::random<Index>(1, 10);
+  Index dim2 = internal::random<Index>(1, 10);
+
+  Tensor<T, 3, Layout> input(1, dim1, dim2);
+  input.setRandom();
+
+  Eigen::array<Index, 3> bcast = {{dim0, 1, 1}};
+  DSizes<Index, 2> chipped_dims(dim0, dim2);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return SkewedInnerBlock<Layout, 2>(chipped_dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return RandomBlock<Layout, 2>(chipped_dims, 1, 5); });
+}
+
+// -------------------------------------------------------------------------- //
+// Verify that assigning block to a Tensor expression produces the same result
+// as an assignment to TensorSliceOp (writing a block is is identical to
+// assigning one tensor to a slice of another tensor).
+
+template <typename T, int NumDims, int Layout, int NumExprDims = NumDims,
+          typename Expression, typename GenBlockParams>
+static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
+                                  Expression expr, GenBlockParams gen_block) {
+  using Device = DefaultDevice;
+  auto d = Device();
+
+  // We use tensor evaluator as a target for block and slice assignments.
+  auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
+
+  // Generate a random block, or choose a block that fits in full expression.
+  TensorBlockParams<NumExprDims> block_params = gen_block();
+
+  // Generate random data of the selected block size.
+  Tensor<T, NumExprDims, Layout> block(block_params.desc.dimensions());
+  block.setRandom();
+
+  // ************************************************************************ //
+  // (1) Assignment from a block.
+
+  // Construct a materialize block from a random generated block tensor.
+  internal::TensorMaterializedBlock<T, NumExprDims, Layout> blk(
+      internal::TensorBlockKind::kView, block.data(), block.dimensions());
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Use evaluator to write block into a tensor.
+  eval.writeBlock(block_params.desc, blk);
+
+  // Make a copy of the result after assignment.
+  Tensor<T, NumDims, Layout> block_assigned = tensor;
+
+  // ************************************************************************ //
+  // (2) Assignment to a slice
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Assign block to a slice of original expression
+  auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+  // Explicitly use coefficient assignment to evaluate slice expression.
+  using SliceAssign = TensorAssignOp<decltype(s_expr), const decltype(block)>;
+  using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+                                       internal::TiledEvaluation::Off>;
+  SliceExecutor::run(SliceAssign(s_expr, block), d);
+
+  // Make a copy of the result after assignment.
+  Tensor<T, NumDims, Layout> slice_assigned = tensor;
+
+  for (Index i = 0; i < tensor.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(block_assigned.coeff(i), slice_assigned.coeff(i));
+  }
+}
+
+// -------------------------------------------------------------------------- //
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map, [&dims]() { return FixedSizeBlock(dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_reshape() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  DSizes<Index, NumDims> shuffled = dims;
+  std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed));
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return FixedSizeBlock(shuffled); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_chipping() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  Index chip_dim = internal::random<int>(0, NumDims - 1);
+  Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+  DSizes<Index, NumDims - 1> chipped_dims;
+  for (Index i = 0; i < chip_dim; ++i) {
+    chipped_dims[i] = dims[i];
+  }
+  for (Index i = chip_dim + 1; i < NumDims; ++i) {
+    chipped_dims[i - 1] = dims[i];
+  }
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return SkewedInnerBlock<Layout>(chipped_dims); });
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_slice() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  // Pick a random slice of tensor.
+  DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+  DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return SkewedInnerBlock<Layout>(slice_size); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+// -------------------------------------------------------------------------- //
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(PART, NAME)           \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME)     \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_LAYOUTS_TYPES(PART, NAME)       \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>()));  \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
+  // clang-format off
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_binary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_generator);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_reverse);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_shuffle);
+
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_reshape_with_bcast);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_forced_eval);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_chipping_of_bcast);
+
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_shuffle);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
+  // clang-format on
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_block_io.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_block_io.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..52f7dde9b095c48e58596b35819cbb39f8d5c75f
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_block_io.cpp
@@ -0,0 +1,445 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+// -------------------------------------------------------------------------- //
+// A set of tests for TensorBlockIO: copying data between tensor blocks.
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
+  DSizes<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<Index>(min, max);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+static internal::TensorBlockShapeType RandomBlockShape() {
+  return internal::random<bool>()
+         ? internal::TensorBlockShapeType::kUniformAllDims
+         : internal::TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static size_t RandomTargetBlockSize(const DSizes<Index, NumDims>& dims) {
+  return internal::random<size_t>(1, dims.TotalSize());
+}
+
+template <int Layout, int NumDims>
+static Index GetInputIndex(Index output_index,
+                           const array<Index, NumDims>& output_to_input_dim_map,
+                           const array<Index, NumDims>& input_strides,
+                           const array<Index, NumDims>& output_strides) {
+  int input_index = 0;
+  if (Layout == ColMajor) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[0]];
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_data_from_source_to_target() {
+  using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Generate a random input Tensor.
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Write data to an output Tensor.
+  Tensor<T, NumDims, Layout> output(dims);
+
+  // Construct a tensor block mapper.
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<NumDims, Layout, Index>;
+  TensorBlockMapper block_mapper(
+      dims, {RandomBlockShape(), RandomTargetBlockSize(dims), {0, 0, 0}});
+
+  // We will copy data from input to output through this buffer.
+  Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+  // Precompute strides for TensorBlockIO::Copy.
+  auto input_strides = internal::strides<Layout>(dims);
+  auto output_strides = internal::strides<Layout>(dims);
+
+  const T* input_data = input.data();
+  T* output_data = output.data();
+  T* block_data = block.data();
+
+  for (int i = 0; i < block_mapper.blockCount(); ++i) {
+    auto desc = block_mapper.blockDescriptor(i);
+
+    auto blk_dims = desc.dimensions();
+    auto blk_strides = internal::strides<Layout>(blk_dims);
+
+    {
+      // Read from input into a block buffer.
+      IODst dst(blk_dims, blk_strides, block_data, 0);
+      IOSrc src(input_strides, input_data, desc.offset());
+
+      TensorBlockIO::Copy(dst, src);
+    }
+
+    {
+      // Write from block buffer to output.
+      IODst dst(blk_dims, output_strides, output_data, desc.offset());
+      IOSrc src(blk_strides, block_data, 0);
+
+      TensorBlockIO::Copy(dst, src);
+    }
+  }
+
+  for (int i = 0; i < dims.TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_using_reordered_dimensions() {
+  // Generate a random input Tensor.
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Create a random dimension re-ordering/shuffle.
+  std::vector<int> shuffle;
+
+  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937(g_seed));
+
+  DSizes<Index, NumDims> output_tensor_dims;
+  DSizes<Index, NumDims> input_to_output_dim_map;
+  DSizes<Index, NumDims> output_to_input_dim_map;
+  for (Index i = 0; i < NumDims; ++i) {
+    output_tensor_dims[shuffle[i]] = dims[i];
+    input_to_output_dim_map[i] = shuffle[i];
+    output_to_input_dim_map[shuffle[i]] = i;
+  }
+
+  // Write data to an output Tensor.
+  Tensor<T, NumDims, Layout> output(output_tensor_dims);
+
+  // Construct a tensor block mapper.
+  // NOTE: Tensor block mapper works with shuffled dimensions.
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<NumDims, Layout, Index>;
+  TensorBlockMapper block_mapper(output_tensor_dims,
+                                 {RandomBlockShape(),
+                                  RandomTargetBlockSize(output_tensor_dims),
+                                  {0, 0, 0}});
+
+  // We will copy data from input to output through this buffer.
+  Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+  // Precompute strides for TensorBlockIO::Copy.
+  auto input_strides = internal::strides<Layout>(dims);
+  auto output_strides = internal::strides<Layout>(output_tensor_dims);
+
+  const T* input_data = input.data();
+  T* output_data = output.data();
+  T* block_data = block.data();
+
+  for (Index i = 0; i < block_mapper.blockCount(); ++i) {
+    auto desc = block_mapper.blockDescriptor(i);
+
+    const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
+        desc.offset(), output_to_input_dim_map, input_strides,
+        output_strides);
+
+    // NOTE: Block dimensions are in the same order as output dimensions.
+
+    using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+    using IODst = typename TensorBlockIO::Dst;
+    using IOSrc = typename TensorBlockIO::Src;
+
+    auto blk_dims = desc.dimensions();
+    auto blk_strides = internal::strides<Layout>(blk_dims);
+
+    {
+      // Read from input into a block buffer.
+      IODst dst(blk_dims, blk_strides, block_data, 0);
+      IOSrc src(input_strides, input_data, first_coeff_index);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(output_to_input_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+
+    {
+      // We need to convert block dimensions from output to input order.
+      auto dst_dims = blk_dims;
+      for (int out_dim = 0; out_dim < NumDims; ++out_dim) {
+        dst_dims[output_to_input_dim_map[out_dim]] = blk_dims[out_dim];
+      }
+
+      // Write from block buffer to output.
+      IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
+      IOSrc src(blk_strides, block_data, 0);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(input_to_output_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+  }
+
+  for (Index i = 0; i < dims.TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+// This is the special case for reading data with reordering, when dimensions
+// before/after reordering are the same. Squeezing reads along inner dimensions
+// in this case is illegal, because we reorder innermost dimension.
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() {
+  DSizes<Index, 3> tensor_dims(7, 9, 7);
+  DSizes<Index, 3> block_dims = tensor_dims;
+
+  DSizes<int, 3> block_to_tensor_dim;
+  block_to_tensor_dim[0] = 2;
+  block_to_tensor_dim[1] = 1;
+  block_to_tensor_dim[2] = 0;
+
+  auto tensor_strides = internal::strides<Layout>(tensor_dims);
+  auto block_strides = internal::strides<Layout>(block_dims);
+
+  Tensor<float, 3, Layout> block(block_dims);
+  Tensor<float, 3, Layout> tensor(tensor_dims);
+  tensor.setRandom();
+
+  float* tensor_data = tensor.data();
+  float* block_data = block.data();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 3, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Read from a tensor into a block.
+  IODst dst(block_dims, block_strides, block_data, 0);
+  IOSrc src(tensor_strides, tensor_data, 0);
+
+  TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim);
+
+  TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims);
+  TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims);
+
+  for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
+    for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
+      for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
+        float block_value = block_tensor(d2, d1, d0);
+        float tensor_value = tensor_tensor(d0, d1, d2);
+        VERIFY_IS_EQUAL(block_value, tensor_value);
+      }
+    }
+  }
+}
+
+// This is the special case for reading data with reordering, when dimensions
+// before/after reordering are the same. Squeezing reads in this case is allowed
+// because we reorder outer dimensions.
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions_squeeze() {
+  DSizes<Index, 4> tensor_dims(7, 5, 9, 9);
+  DSizes<Index, 4> block_dims = tensor_dims;
+
+  DSizes<int, 4> block_to_tensor_dim;
+  block_to_tensor_dim[0] = 0;
+  block_to_tensor_dim[1] = 1;
+  block_to_tensor_dim[2] = 3;
+  block_to_tensor_dim[3] = 2;
+
+  auto tensor_strides = internal::strides<Layout>(tensor_dims);
+  auto block_strides = internal::strides<Layout>(block_dims);
+
+  Tensor<float, 4, Layout> block(block_dims);
+  Tensor<float, 4, Layout> tensor(tensor_dims);
+  tensor.setRandom();
+
+  float* tensor_data = tensor.data();
+  float* block_data = block.data();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 4, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Read from a tensor into a block.
+  IODst dst(block_dims, block_strides, block_data, 0);
+  IOSrc src(tensor_strides, tensor_data, 0);
+
+  TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim);
+
+  TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims);
+  TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims);
+
+  for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
+    for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
+      for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
+        for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) {
+          float block_value = block_tensor(d0, d1, d3, d2);
+          float tensor_value = tensor_tensor(d0, d1, d2, d3);
+          VERIFY_IS_EQUAL(block_value, tensor_value);
+        }
+      }
+    }
+  }
+}
+
+template <int Layout>
+static void test_block_io_zero_stride() {
+  DSizes<Index, 5> rnd_dims = RandomDims<5>(1, 30);
+
+  DSizes<Index, 5> input_tensor_dims = rnd_dims;
+  input_tensor_dims[0] = 1;
+  input_tensor_dims[2] = 1;
+  input_tensor_dims[4] = 1;
+
+  Tensor<float, 5, Layout> input(input_tensor_dims);
+  input.setRandom();
+
+  DSizes<Index, 5> output_tensor_dims = rnd_dims;
+
+  auto input_tensor_strides = internal::strides<Layout>(input_tensor_dims);
+  auto output_tensor_strides = internal::strides<Layout>(output_tensor_dims);
+
+  auto input_tensor_strides_with_zeros = input_tensor_strides;
+  input_tensor_strides_with_zeros[0] = 0;
+  input_tensor_strides_with_zeros[2] = 0;
+  input_tensor_strides_with_zeros[4] = 0;
+
+  Tensor<float, 5, Layout> output(output_tensor_dims);
+  output.setRandom();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Write data from input to output with broadcasting in dims [0, 2, 4].
+  IODst dst(output_tensor_dims, output_tensor_strides, output.data(), 0);
+  IOSrc src(input_tensor_strides_with_zeros, input.data(), 0);
+  TensorBlockIO::Copy(dst, src);
+
+  for (int i = 0; i < output_tensor_dims[0]; ++i) {
+    for (int j = 0; j < output_tensor_dims[1]; ++j) {
+      for (int k = 0; k < output_tensor_dims[2]; ++k) {
+        for (int l = 0; l < output_tensor_dims[3]; ++l) {
+          for (int m = 0; m < output_tensor_dims[4]; ++m) {
+            float input_value = input(0, j, 0, l, 0);
+            float output_value = output(i, j, k, l, m);
+            VERIFY_IS_EQUAL(input_value, output_value);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <int Layout>
+static void test_block_io_squeeze_ones() {
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Total size > 1.
+  {
+    DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
+    auto strides = internal::strides<Layout>(block_sizes);
+
+    // Create a random input tensor.
+    Tensor<float, 5> input(block_sizes);
+    input.setRandom();
+
+    Tensor<float, 5> output(block_sizes);
+
+    IODst dst(block_sizes, strides, output.data(), 0);
+    IOSrc src(strides, input.data());
+    TensorBlockIO::Copy(dst, src);
+
+    for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
+    }
+  }
+
+  // Total size == 1.
+  {
+    DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
+    auto strides = internal::strides<Layout>(block_sizes);
+
+    // Create a random input tensor.
+    Tensor<float, 5> input(block_sizes);
+    input.setRandom();
+
+    Tensor<float, 5> output(block_sizes);
+
+    IODst dst(block_sizes, strides, output.data(), 0);
+    IOSrc src(strides, input.data());
+    TensorBlockIO::Copy(dst, src);
+
+    for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
+    }
+  }
+}
+
+#define CALL_SUBTESTS(NAME)                   \
+  CALL_SUBTEST((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 5, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_io) {
+  // clang-format off
+  CALL_SUBTESTS(test_block_io_copy_data_from_source_to_target);
+  CALL_SUBTESTS(test_block_io_copy_using_reordered_dimensions);
+
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<RowMajor>());
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<RowMajor>());
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_zero_stride<RowMajor>());
+  CALL_SUBTEST(test_block_io_zero_stride<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_squeeze_ones<RowMajor>());
+  CALL_SUBTEST(test_block_io_squeeze_ones<ColMajor>());
+  // clang-format on
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 7201bfe3739385cb43876acc9d344d5396c7a364..20f84b8e09bd72823ea872a3653ef1bb6fb97d30 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -13,8 +13,8 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
 #include "main.h"
@@ -25,50 +25,120 @@ using Eigen::SyclDevice;
 using Eigen::Tensor;
 using Eigen::TensorMap;
 
-static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_broadcast_sycl_fixed(const Eigen::SyclDevice &sycl_device){
 
   // BROADCAST test:
-  array<int, 4> in_range   = {{2, 3, 5, 7}};
-  array<int, 4> broadcasts = {{2, 3, 1, 4}};
-  array<int, 4> out_range;  // = in_range * broadcasts
+  IndexType inDim1=2;
+  IndexType inDim2=3;
+  IndexType inDim3=5;
+  IndexType inDim4=7;
+  IndexType bDim1=2;
+  IndexType bDim2=3;
+  IndexType bDim3=1;
+  IndexType bDim4=4;
+  array<IndexType, 4> in_range   = {{inDim1, inDim2, inDim3, inDim4}};
+  array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
+  array<IndexType, 4> out_range;  // = in_range * broadcasts
   for (size_t i = 0; i < out_range.size(); ++i)
     out_range[i] = in_range[i] * broadcasts[i];
 
-  Tensor<float, 4>  input(in_range);
-  Tensor<float, 4> out(out_range);
+  Tensor<DataType, 4, DataLayout, IndexType>  input(in_range);
+  Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
 
   for (size_t i = 0; i < in_range.size(); ++i)
     VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
 
 
-  for (int i = 0; i < input.size(); ++i)
-    input(i) = static_cast<float>(i);
+  for (IndexType i = 0; i < input.size(); ++i)
+    input(i) = static_cast<DataType>(i);
 
-  float * gpu_in_data  = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_out_data  = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+  DataType * gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data  = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
 
-  TensorMap<Tensor<float, 4>>  gpu_in(gpu_in_data, in_range);
-  TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range);
-  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float));
+  TensorMap<TensorFixedSize<DataType, Sizes<2, 3, 5, 7>, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
   gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
 
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 9; ++j) {
-      for (int k = 0; k < 5; ++k) {
-        for (int l = 0; l < 28; ++l) {
+  for (IndexType i = 0; i < inDim1*bDim1; ++i) {
+    for (IndexType j = 0; j < inDim2*bDim2; ++j) {
+      for (IndexType k = 0; k < inDim3*bDim3; ++k) {
+        for (IndexType l = 0; l < inDim4*bDim4; ++l) {
           VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l));
         }
       }
     }
   }
+  printf("Broadcast Test with fixed size Passed\n");
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
+
+  // BROADCAST test:
+  IndexType inDim1=2;
+  IndexType inDim2=3;
+  IndexType inDim3=5;
+  IndexType inDim4=7;
+  IndexType bDim1=2;
+  IndexType bDim2=3;
+  IndexType bDim3=1;
+  IndexType bDim4=4;
+  array<IndexType, 4> in_range   = {{inDim1, inDim2, inDim3, inDim4}};
+  array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
+  array<IndexType, 4> out_range;  // = in_range * broadcasts
+  for (size_t i = 0; i < out_range.size(); ++i)
+    out_range[i] = in_range[i] * broadcasts[i];
+
+  Tensor<DataType, 4, DataLayout, IndexType>  input(in_range);
+  Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
+
+  for (size_t i = 0; i < in_range.size(); ++i)
+    VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
+
+
+  for (IndexType i = 0; i < input.size(); ++i)
+    input(i) = static_cast<DataType>(i);
+
+  DataType * gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data  = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>>  gpu_in(gpu_in_data, in_range);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < inDim1*bDim1; ++i) {
+    for (IndexType j = 0; j < inDim2*bDim2; ++j) {
+      for (IndexType k = 0; k < inDim3*bDim3; ++k) {
+        for (IndexType l = 0; l < inDim4*bDim4; ++l) {
+          VERIFY_IS_APPROX(input(i%inDim1,j%inDim2,k%inDim3,l%inDim4), out(i,j,k,l));
+        }
+      }
+    }
+  }
   printf("Broadcast Test Passed\n");
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
 
-void test_cxx11_tensor_broadcast_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_broadcast_sycl(sycl_device));
+template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
+  }
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_broadcasting.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_broadcasting.cpp
index 5c0ea5889871508a556bd00bfffc842fb1d2534a..d3dab891f20d4f0f6eae7bc7b245d408df3f7a2c 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -91,7 +91,16 @@ static void test_vectorized_broadcasting()
     }
   }
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   tensor.resize(11,3,5);
+#else
+  array<Index, 3> new_dims;
+  new_dims[0] = 11;
+  new_dims[1] = 3;
+  new_dims[2] = 5;
+  tensor.resize(new_dims);
+#endif
+
   tensor.setRandom();
   broadcast = tensor.broadcast(broadcasts);
 
@@ -115,7 +124,7 @@ static void test_static_broadcasting()
   Tensor<float, 3, DataLayout> tensor(8,3,5);
   tensor.setRandom();
 
-#if EIGEN_HAS_CONSTEXPR
+#if defined(EIGEN_HAS_INDEX_LIST)
   Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
 #else
   Eigen::array<int, 3> broadcasts;
@@ -139,7 +148,16 @@ static void test_static_broadcasting()
     }
   }
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   tensor.resize(11,3,5);
+#else
+  array<Index, 3> new_dims;
+  new_dims[0] = 11;
+  new_dims[1] = 3;
+  new_dims[2] = 5;
+  tensor.resize(new_dims);
+#endif
+
   tensor.setRandom();
   broadcast = tensor.broadcast(broadcasts);
 
@@ -180,8 +198,119 @@ static void test_fixed_size_broadcasting()
 #endif
 }
 
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n()
+{
+  Tensor<float, 4, DataLayout> tensor(1,13,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 9;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 1;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 13);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 7);
+
+  for (int i = 0; i < 9; ++i) {
+    for (int j = 0; j < 13; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_n_by_one()
+{
+  Tensor<float, 4, DataLayout> tensor(7,3,5,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 1;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 19;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 7);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 3);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_1d()
+{
+  Tensor<float, 3, DataLayout> tensor(1,7,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 3> broadcasts;
+  broadcasts[0] = 5;
+  broadcasts[1] = 1;
+  broadcasts[2] = 13;
+  Tensor<float, 3, DataLayout> broadcasted;
+  broadcasted = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcasted.dimension(0), 5);
+  VERIFY_IS_EQUAL(broadcasted.dimension(1), 7);
+  VERIFY_IS_EQUAL(broadcasted.dimension(2), 13);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 13; ++k) {
+        VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_2d()
+{
+  Tensor<float, 4, DataLayout> tensor(1,7,13,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 5;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 19;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 7);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 13);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 13; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
 
-void test_cxx11_tensor_broadcasting()
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting)
 {
   CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
   CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
@@ -191,4 +320,12 @@ void test_cxx11_tensor_broadcasting()
   CALL_SUBTEST(test_static_broadcasting<RowMajor>());
   CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
   CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72cb62fd56eab612f3011c9675398cdc8100b67a
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -0,0 +1,354 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+// Functions used to compare the TensorMap implementation on the device with
+// the equivalent on the host
+namespace cl {
+namespace sycl {
+template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
+template <typename T> T square(T x) { return x * x; }
+template <typename T> T cube(T x) { return x * x * x; }
+template <typename T> T inverse(T x) { return T(1) / x; }
+template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
+template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
+}
+}
+
+struct EqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
+};
+
+struct PlusEqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
+};
+
+template <typename DataType, int DataLayout,
+          typename Assignement, typename Operator>
+void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
+                                    const array<int64_t, 3>& tensor_range) {
+  Operator op;
+  Assignement asgn;
+  {
+    /* Assignement(out, Operator(in)) */
+    Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    in = in.random() + DataType(0.01);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data = static_cast<DataType *>(
+        sycl_device.allocate(in.size() * sizeof(DataType)));
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                   (in.size()) * sizeof(DataType));
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(in(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data);
+    sycl_device.deallocate(gpu_data_out);
+  }
+  {
+    /* Assignement(out, Operator(out)) */
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu_out));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(reference(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data_out);
+  }
+}
+
+#define DECLARE_UNARY_STRUCT(FUNC)                                 \
+  struct op_##FUNC {                                               \
+    template <typename T>                                          \
+    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) {   \
+      return cl::sycl::FUNC(x);                                    \
+    }                                                              \
+    template <typename T>                                          \
+    auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
+      return x.FUNC();                                             \
+    }                                                              \
+  };
+
+DECLARE_UNARY_STRUCT(abs)
+DECLARE_UNARY_STRUCT(sqrt)
+DECLARE_UNARY_STRUCT(rsqrt)
+DECLARE_UNARY_STRUCT(square)
+DECLARE_UNARY_STRUCT(cube)
+DECLARE_UNARY_STRUCT(inverse)
+DECLARE_UNARY_STRUCT(tanh)
+DECLARE_UNARY_STRUCT(exp)
+DECLARE_UNARY_STRUCT(expm1)
+DECLARE_UNARY_STRUCT(log)
+DECLARE_UNARY_STRUCT(ceil)
+DECLARE_UNARY_STRUCT(floor)
+DECLARE_UNARY_STRUCT(round)
+DECLARE_UNARY_STRUCT(log1p)
+DECLARE_UNARY_STRUCT(sign)
+DECLARE_UNARY_STRUCT(isnan)
+DECLARE_UNARY_STRUCT(isfinite)
+DECLARE_UNARY_STRUCT(isinf)
+
+template <typename DataType, int DataLayout, typename Assignement>
+void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
+                                         const array<int64_t, 3>& tensor_range) {
+#define RUN_UNARY_TEST(FUNC) \
+  test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
+                                 op_##FUNC>(sycl_device, tensor_range)
+  RUN_UNARY_TEST(abs);
+  RUN_UNARY_TEST(sqrt);
+  RUN_UNARY_TEST(rsqrt);
+  RUN_UNARY_TEST(square);
+  RUN_UNARY_TEST(cube);
+  RUN_UNARY_TEST(inverse);
+  RUN_UNARY_TEST(tanh);
+  RUN_UNARY_TEST(exp);
+  RUN_UNARY_TEST(expm1);
+  RUN_UNARY_TEST(log);
+  RUN_UNARY_TEST(ceil);
+  RUN_UNARY_TEST(floor);
+  RUN_UNARY_TEST(round);
+  RUN_UNARY_TEST(log1p);
+  RUN_UNARY_TEST(sign);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+  Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
+  in = in.random() + DataType(0.01);
+  DataType *gpu_data = static_cast<DataType *>(
+      sycl_device.allocate(in.size() * sizeof(DataType)));
+  bool *gpu_data_out =
+      static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+  TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                 (in.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(bool));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_EQUAL(out(i), op(in(i)));
+  }
+  sycl_device.deallocate(gpu_data);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, int DataLayout>
+void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
+                         const array<int64_t, 3>& tensor_range) {
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      PlusEqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      EqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isnan>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isfinite>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isinf>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+
+  test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
+                               const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, in_2) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random() + DataType(0.01);
+  in_2 = in_2.random() + DataType(0.01);
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_2 = static_cast<DataType *>(
+      sycl_device.allocate(in_2.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
+                                 (in_2.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
+  }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_2);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, 2) */
+  Operator op;
+  const DataType arg2(2);
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random();
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, arg2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
+  }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+#define DECLARE_BINARY_STRUCT(FUNC)                                                          \
+  struct op_##FUNC {                                                                         \
+    template <typename T1, typename T2>                                                      \
+    auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) {            \
+      return cl::sycl::FUNC(x, y);                                                           \
+    }                                                                                        \
+    template <typename T1, typename T2>                                                      \
+    auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
+      return x.FUNC(y);                                                                      \
+    }                                                                                        \
+  };
+
+DECLARE_BINARY_STRUCT(cwiseMax)
+DECLARE_BINARY_STRUCT(cwiseMin)
+
+#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR)                          \
+  struct op_##NAME {                                                      \
+    template <typename T1, typename T2>                                   \
+    auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
+      return x OPERATOR y;                                                \
+    }                                                                     \
+  };
+
+DECLARE_BINARY_STRUCT_OP(plus, +)
+DECLARE_BINARY_STRUCT_OP(minus, -)
+DECLARE_BINARY_STRUCT_OP(times, *)
+DECLARE_BINARY_STRUCT_OP(divide, /)
+DECLARE_BINARY_STRUCT_OP(modulo, %)
+
+template <typename DataType, int DataLayout>
+void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
+                          const array<int64_t, 3>& tensor_range) {
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_cwiseMax>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_cwiseMin>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_plus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_minus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_times>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_divide>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins_fixed_arg2<DataType, RowMajor,
+                                  op_modulo>(sycl_device, tensor_range);
+  test_binary_builtins_fixed_arg2<DataType, ColMajor,
+                                  op_modulo>(sycl_device, tensor_range);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    QueueInterface queueInterface(device);
+    Eigen::SyclDevice sycl_device(&queueInterface);
+    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
similarity index 91%
rename from contrib/eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
index 816e0322075baa07f1d2b64bad19511ac23bfcae..97923d15f19f6d1adfd0077e3ecb9e3bb0384dc4 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
@@ -9,7 +9,7 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
@@ -18,8 +18,8 @@
 
 using Eigen::Tensor;
 
-void test_cuda_conversion() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -72,8 +72,8 @@ void test_fallback_conversion() {
 }
 
 
-void test_cxx11_tensor_cast_float16_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu)
 {
-  CALL_SUBTEST(test_cuda_conversion());
+  CALL_SUBTEST(test_gpu_conversion());
   CALL_SUBTEST(test_fallback_conversion());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_casts.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_casts.cpp
index 3c6d0d2fff6347f90dc318f9c981b8e10b6d8910..45456f3efa6d4ce6feaad5e3ae978fb469cb80f3 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_casts.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_casts.cpp
@@ -8,6 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "main.h"
+#include "random_without_cast_overflow.h"
 
 #include <Eigen/CXX11/Tensor>
 
@@ -104,12 +105,82 @@ static void test_small_to_big_type_cast()
   }
 }
 
+template <typename FromType, typename ToType>
+static void test_type_cast() {
+  Tensor<FromType, 2> ftensor(100, 200);
+  // Generate random values for a valid cast.
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 200; ++j) {
+      ftensor(i, j) = internal::random_without_cast_overflow<FromType,ToType>::value();
+    }
+  }
+
+  Tensor<ToType, 2> ttensor(100, 200);
+  ttensor = ftensor.template cast<ToType>();
+
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 200; ++j) {
+      const ToType ref = internal::cast<FromType,ToType>(ftensor(i, j));
+      VERIFY_IS_APPROX(ttensor(i, j), ref);
+    }
+  }
+}
+
+template<typename Scalar, typename EnableIf = void>
+struct test_cast_runner {
+  static void run() {
+    test_type_cast<Scalar, bool>();
+    test_type_cast<Scalar, int8_t>();
+    test_type_cast<Scalar, int16_t>();
+    test_type_cast<Scalar, int32_t>();
+    test_type_cast<Scalar, int64_t>();
+    test_type_cast<Scalar, uint8_t>();
+    test_type_cast<Scalar, uint16_t>();
+    test_type_cast<Scalar, uint32_t>();
+    test_type_cast<Scalar, uint64_t>();
+    test_type_cast<Scalar, half>();
+    test_type_cast<Scalar, bfloat16>();
+    test_type_cast<Scalar, float>();
+    test_type_cast<Scalar, double>();
+    test_type_cast<Scalar, std::complex<float>>();
+    test_type_cast<Scalar, std::complex<double>>();
+  }
+};
+
+// Only certain types allow cast from std::complex<>.
+template<typename Scalar>
+struct test_cast_runner<Scalar, typename internal::enable_if<NumTraits<Scalar>::IsComplex>::type> {
+  static void run() {
+    test_type_cast<Scalar, half>();
+    test_type_cast<Scalar, bfloat16>();
+    test_type_cast<Scalar, std::complex<float>>();
+    test_type_cast<Scalar, std::complex<double>>();
+  }
+};
+
 
-void test_cxx11_tensor_casts()
+EIGEN_DECLARE_TEST(cxx11_tensor_casts)
 {
-   CALL_SUBTEST(test_simple_cast());
-   CALL_SUBTEST(test_vectorized_cast());
-   CALL_SUBTEST(test_float_to_int_cast());
-   CALL_SUBTEST(test_big_to_small_type_cast());
-   CALL_SUBTEST(test_small_to_big_type_cast());
+  CALL_SUBTEST(test_simple_cast());
+  CALL_SUBTEST(test_vectorized_cast());
+  CALL_SUBTEST(test_float_to_int_cast());
+  CALL_SUBTEST(test_big_to_small_type_cast());
+  CALL_SUBTEST(test_small_to_big_type_cast());
+
+  CALL_SUBTEST(test_cast_runner<bool>::run());
+  CALL_SUBTEST(test_cast_runner<int8_t>::run());
+  CALL_SUBTEST(test_cast_runner<int16_t>::run());
+  CALL_SUBTEST(test_cast_runner<int32_t>::run());
+  CALL_SUBTEST(test_cast_runner<int64_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint8_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint16_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint32_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint64_t>::run());
+  CALL_SUBTEST(test_cast_runner<half>::run());
+  CALL_SUBTEST(test_cast_runner<bfloat16>::run());
+  CALL_SUBTEST(test_cast_runner<float>::run());
+  CALL_SUBTEST(test_cast_runner<double>::run());
+  CALL_SUBTEST(test_cast_runner<std::complex<float>>::run());
+  CALL_SUBTEST(test_cast_runner<std::complex<double>>::run());
+
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_chipping.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_chipping.cpp
index 1832dec8bf46a711cac0b75d729ec7a86fa6410d..922274462bcb7443f9c475f7bcaa40b460c97bc3 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_chipping.cpp
@@ -43,7 +43,7 @@ static void test_simple_chip()
   VERIFY_IS_EQUAL(chip2.dimension(2), 7);
   VERIFY_IS_EQUAL(chip2.dimension(3), 11);
   for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 3; ++j) {
+    for (int j = 0; j < 5; ++j) {
       for (int k = 0; k < 7; ++k) {
         for (int l = 0; l < 11; ++l) {
           VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
@@ -75,7 +75,7 @@ static void test_simple_chip()
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
-        for (int l = 0; l < 7; ++l) {
+        for (int l = 0; l < 11; ++l) {
           VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
         }
       }
@@ -126,7 +126,7 @@ static void test_dynamic_chip()
   VERIFY_IS_EQUAL(chip2.dimension(2), 7);
   VERIFY_IS_EQUAL(chip2.dimension(3), 11);
   for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 3; ++j) {
+    for (int j = 0; j < 5; ++j) {
       for (int k = 0; k < 7; ++k) {
         for (int l = 0; l < 11; ++l) {
           VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
@@ -158,7 +158,7 @@ static void test_dynamic_chip()
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
-        for (int l = 0; l < 7; ++l) {
+        for (int l = 0; l < 11; ++l) {
           VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
         }
       }
@@ -410,7 +410,7 @@ static void test_chip_raw_data_row_major()
   VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0));
 }
 
-void test_cxx11_tensor_chipping()
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping)
 {
   CALL_SUBTEST(test_simple_chip<ColMajor>());
   CALL_SUBTEST(test_simple_chip<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e70931044dad531e5b65d5c40ef80c4abb761f1
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -0,0 +1,623 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+
+  tensor.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l);
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
+  VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim2; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
+  const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l);
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
+  const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
+  DataType* gpu_data_chip3  = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
+
+  gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l);
+  sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
+  const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
+  DataType* gpu_data_chip4  = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
+
+  gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l);
+  sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
+        }
+      }
+    }
+  }
+
+
+  array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
+  const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
+  DataType* gpu_data_chip5  = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
+
+  gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l);
+  sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_chip2);
+  sycl_device.deallocate(gpu_data_chip3);
+  sycl_device.deallocate(gpu_data_chip4);
+  sycl_device.deallocate(gpu_data_chip5);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+
+  tensor.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l);
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
+  VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim2; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
+  const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l);
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
+  const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
+  DataType* gpu_data_chip3  = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
+
+  gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l);
+  sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
+  const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
+  DataType* gpu_data_chip4  = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
+
+  gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l);
+  sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
+        }
+      }
+    }
+  }
+
+
+  array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
+  const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
+  DataType* gpu_data_chip5  = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
+
+  gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l);
+  sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_chip2);
+  sycl_device.deallocate(gpu_data_chip3);
+  sycl_device.deallocate(gpu_data_chip4);
+  sycl_device.deallocate(gpu_data_chip5);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) {
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange);
+  tensor.setRandom();
+  tensor1.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+  DataType* gpu_data_tensor1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange);
+
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1;
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  for (int i = 0; i < sizeDim2; ++i) {
+    for (int j = 0; j < sizeDim3; ++j) {
+      for (int k = 0; k < sizeDim4; ++k) {
+        for (int l = 0; l < sizeDim5; ++l) {
+          float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l);
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), expected);
+        }
+      }
+    }
+  }
+
+  array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange);
+  Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange);
+  tensor2.setRandom();
+  const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType);
+  DataType* gpu_data_tensor2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize);
+  gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2;
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  for (int i = 0; i < sizeDim2; ++i) {
+    for (int j = 0; j < sizeDim4; ++j) {
+      for (int k = 0; k < sizeDim5; ++k) {
+        float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k);
+        VERIFY_IS_EQUAL(chip2(i,j,k), expected);
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_tensor1);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_tensor2);
+  sycl_device.deallocate(gpu_data_chip2);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange);
+  input1.setRandom();
+  input2.setRandom();
+
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t input2TensorBuffSize =input2.size()*sizeof(DataType);
+  std::cout << tensorBuffSize << " , "<<  input2TensorBuffSize << std::endl;
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_input1  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_input2  = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize);
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize);
+  gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (i != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange);
+  input3.setRandom();
+
+  const size_t input3TensorBuffSize =input3.size()*sizeof(DataType);
+  DataType* gpu_data_input3  = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize);
+  gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (j != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange);
+  input4.setRandom();
+
+  const size_t input4TensorBuffSize =input4.size()*sizeof(DataType);
+  DataType* gpu_data_input4  = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize);
+  gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (k != 3) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange);
+  input5.setRandom();
+
+  const size_t input5TensorBuffSize =input5.size()*sizeof(DataType);
+  DataType* gpu_data_input5  = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize);
+  gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (l != 4) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
+            }
+          }
+        }
+      }
+    }
+  }
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange);
+  input6.setRandom();
+
+  const size_t input6TensorBuffSize =input6.size()*sizeof(DataType);
+  DataType* gpu_data_input6  = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize);
+  gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (m != 5) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange);
+  input7.setRandom();
+
+  DataType* gpu_data_input7  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize);
+  gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l);
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (i != 0) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_input1);
+  sycl_device.deallocate(gpu_data_input2);
+  sycl_device.deallocate(gpu_data_input3);
+  sycl_device.deallocate(gpu_data_input4);
+  sycl_device.deallocate(gpu_data_input5);
+  sycl_device.deallocate(gpu_data_input6);
+  sycl_device.deallocate(gpu_data_input7);
+
+}
+
+template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ /* test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device);
+  test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);*/
+  test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ // test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_comparisons.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_comparisons.cpp
index b1ff8aecb9e85eb15ea9c33afd00d02f2832b9ac..1a18e07cca5a57413289c168e06df55dce541046 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_comparisons.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -77,7 +77,7 @@ static void test_equality()
 }
 
 
-void test_cxx11_tensor_comparisons()
+EIGEN_DECLARE_TEST(cxx11_tensor_comparisons)
 {
   CALL_SUBTEST(test_orderings());
   CALL_SUBTEST(test_equality());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
similarity index 89%
rename from contrib/eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
index aac7809056fcf8dffaab4c229801c8f71e856099..99447b21dcf856b2d8ebf4971387de6569ca1132 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
@@ -8,7 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
@@ -28,7 +28,7 @@ void test_cuda_complex_cwise_ops() {
   cudaMalloc((void**)(&d_in2), complex_bytes);
   cudaMalloc((void**)(&d_out), complex_bytes);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -48,11 +48,13 @@ void test_cuda_complex_cwise_ops() {
     Add = 0,
     Sub,
     Mul,
-    Div
+    Div,
+    Neg,
+    NbOps
   };
 
   Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
-  for (int op = Add; op <= Div; op++) {
+  for (int op = Add; op < NbOps; op++) {
     std::complex<T> expected;
     switch (static_cast<CwiseOp>(op)) {
       case Add:
@@ -71,6 +73,12 @@ void test_cuda_complex_cwise_ops() {
         gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
         expected = a / b;
         break;
+      case Neg:
+        gpu_out.device(gpu_device) = -gpu_in1;
+        expected = -a;
+        break;
+      case NbOps:
+        break;
     }
     assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
                            gpu_device.stream()) == cudaSuccess);
@@ -87,7 +95,7 @@ void test_cuda_complex_cwise_ops() {
 }
 
 
-void test_cxx11_tensor_complex_cwise_ops()
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops)
 {
   CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
   CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_complex_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_complex_gpu.cu
similarity index 76%
rename from contrib/eigen/unsupported/test/cxx11_tensor_complex_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_complex_gpu.cu
index 916f12a84230e7fa9c6a4c399081edeae99fc064..f8b8ae704857fdd30cc090b5f6c215f766bacc8c 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_complex_gpu.cu
@@ -8,7 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
@@ -34,7 +34,7 @@ void test_cuda_nullary() {
   cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -70,7 +70,7 @@ void test_cuda_nullary() {
 
 static void test_cuda_sum_reductions() {
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   const int num_rows = internal::random<int>(1024, 5*1024);
@@ -104,10 +104,45 @@ static void test_cuda_sum_reductions() {
   gpu_device.deallocate(gpu_out_ptr);
 }
 
+static void test_cuda_mean_reductions() {
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<std::complex<float>, 0> full_redux;
+  full_redux = in.mean();
+
+  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.mean();
+
+  Tensor<std::complex<float>, 0> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
 
 static void test_cuda_product_reductions() {
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   const int num_rows = internal::random<int>(1024, 5*1024);
@@ -142,9 +177,10 @@ static void test_cuda_product_reductions() {
 }
 
 
-void test_cxx11_tensor_complex()
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
 {
   CALL_SUBTEST(test_cuda_nullary());
   CALL_SUBTEST(test_cuda_sum_reductions());
+  CALL_SUBTEST(test_cuda_mean_reductions());
   CALL_SUBTEST(test_cuda_product_reductions());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_concatenation.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_concatenation.cpp
index 03ef12e636e840925f1f4a7235eec914e565b7ef..bb9418d33a15856567ddb100d569d41be5d02428 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -50,7 +50,13 @@ static void test_static_dimension_failure()
       .reshape(Tensor<int, 3>::Dimensions(2, 3, 1))
       .concatenate(right, 0);
   Tensor<int, 2, DataLayout> alternative = left
-      .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{{2, 3}}}), 0);
+   // Clang compiler break with {{{}}} with an ambiguous error on copy constructor
+  // the variadic DSize constructor added for #ifndef EIGEN_EMULATE_CXX11_META_H.
+  // Solution:
+  // either the code should change to 
+  //  Tensor<int, 2>::Dimensions{{2, 3}}
+  // or Tensor<int, 2>::Dimensions{Tensor<int, 2>::Dimensions{{2, 3}}}
+      .concatenate(right.reshape(Tensor<int, 2>::Dimensions(2, 3)), 0);
 }
 
 template<int DataLayout>
@@ -123,7 +129,7 @@ static void test_concatenation_as_lvalue()
 }
 
 
-void test_cxx11_tensor_concatenation()
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation)
 {
    CALL_SUBTEST(test_dimension_failures<ColMajor>());
    CALL_SUBTEST(test_dimension_failures<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..765991b357c84d1b4d04da27739c26413b4e061c
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType leftDim1 = 2;
+  IndexType leftDim2 = 3;
+  IndexType leftDim3 = 1;
+  Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
+  IndexType rightDim1 = 2;
+  IndexType rightDim2 = 3;
+  IndexType rightDim3 = 1;
+  Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
+
+  //IndexType concatDim1 = 3;
+//	IndexType concatDim2 = 3;
+//	IndexType concatDim3 = 1;
+  //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> left(leftRange);
+  Tensor<DataType, 3, DataLayout, IndexType> right(rightRange);
+  left.setRandom();
+  right.setRandom();
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+  sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
+  ///
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
+  DataType * gpu_out_data1 =  static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
+
+  //concatenation = left.concatenate(right, 0);
+  gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
+  sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
+  VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
+  for (IndexType j = 0; j < 3; ++j) {
+    for (IndexType i = 0; i < 2; ++i) {
+      VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
+    }
+    for (IndexType i = 2; i < 4; ++i) {
+      VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
+    }
+  }
+
+  sycl_device.deallocate(gpu_out_data1);
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
+  DataType * gpu_out_data2 =  static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
+  gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
+  sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
+
+  //concatenation = left.concatenate(right, 1);
+  VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
+  VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
+    }
+    for (IndexType j = 3; j < 6; ++j) {
+      VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
+    }
+  }
+  sycl_device.deallocate(gpu_out_data2);
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
+  DataType * gpu_out_data3 =  static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
+  gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
+  sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
+
+  //concatenation = left.concatenate(right, 2);
+  VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
+      VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
+    }
+  }
+  sycl_device.deallocate(gpu_out_data3);
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+}
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType leftDim1 = 2;
+  IndexType leftDim2 = 3;
+  Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}};
+
+  IndexType rightDim1 = 2;
+  IndexType rightDim2 = 3;
+  Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}};
+
+  IndexType concatDim1 = 4;
+  IndexType concatDim2 = 3;
+  Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> left(leftRange);
+  Tensor<DataType, 2, DataLayout, IndexType> right(rightRange);
+  Tensor<DataType, 2, DataLayout, IndexType> result(resRange);
+
+  left.setRandom();
+  right.setRandom();
+  result.setRandom();
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
+
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
+
+//  t1.concatenate(t2, 0) = result;
+ gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out;
+ sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(left(i, j), result(i, j));
+      VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+
+template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
+  test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConcat_perDevice<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_const.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_const.cpp
index ad9c9da398349a05023c0c554cab7bda2b429378..9d806ee3c4cedfed593f2ebdcc9a42e1a943b3fc 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_const.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_const.cpp
@@ -55,7 +55,7 @@ static void test_assign_of_const_tensor()
 }
 
 
-void test_cxx11_tensor_const()
+EIGEN_DECLARE_TEST(cxx11_tensor_const)
 {
   CALL_SUBTEST(test_simple_assign());
   CALL_SUBTEST(test_assign_of_const_tensor());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_contract_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_contract_gpu.cu
similarity index 68%
rename from contrib/eigen/unsupported/test/cxx11_tensor_contract_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_contract_gpu.cu
index e821ccf0ca3befc09988ea9e7ffbe081736cc717..575bdc1f9594ab30d80887c9e0badc5c4f202b26 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_contract_gpu.cu
@@ -10,18 +10,20 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;
 
 template<int DataLayout>
-void test_cuda_contraction(int m_size, int k_size, int n_size)
+void test_gpu_contraction(int m_size, int k_size, int n_size)
 {
   std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
   // with these dimensions, the output has 300 * 140 elements, which is
@@ -44,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
   float* d_t_right;
   float* d_t_result;
 
-  cudaMalloc((void**)(&d_t_left), t_left_bytes);
-  cudaMalloc((void**)(&d_t_right), t_right_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
 
-  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -65,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
   gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
   t_result = t_left.contract(t_right, dims);
 
-  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
   for (DenseIndex i = 0; i < t_result.size(); i++) {
     if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
       continue;
@@ -78,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
     assert(false);
   }
 
-  cudaFree((void*)d_t_left);
-  cudaFree((void*)d_t_right);
-  cudaFree((void*)d_t_result);
+  gpuFree((void*)d_t_left);
+  gpuFree((void*)d_t_right);
+  gpuFree((void*)d_t_result);
 }
 
 
@@ -108,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size)
   float* d_t_right;
   float* d_t_result;
 
-  cudaMalloc((void**)(&d_t_left), t_left_bytes);
-  cudaMalloc((void**)(&d_t_right), t_right_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
 
-  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -128,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size)
   gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
   t_result = t_left.contract(t_right, dims);
 
-  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
   if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
       !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
     std::cout << "mismatch detected: " << t_result()
@@ -136,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size)
     assert(false);
   }
 
-  cudaFree((void*)d_t_left);
-  cudaFree((void*)d_t_right);
-  cudaFree((void*)d_t_result);
+  gpuFree((void*)d_t_left);
+  gpuFree((void*)d_t_right);
+  gpuFree((void*)d_t_result);
 }
 
 
 template<int DataLayout>
-void test_cuda_contraction_m() {
+void test_gpu_contraction_m() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(k, 128, 128);
-    test_cuda_contraction<RowMajor>(k, 128, 128);
+    test_gpu_contraction<ColMajor>(k, 128, 128);
+    test_gpu_contraction<RowMajor>(k, 128, 128);
   }
 }
 
 template<int DataLayout>
-void test_cuda_contraction_k() {
+void test_gpu_contraction_k() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(128, k, 128);
-    test_cuda_contraction<RowMajor>(128, k, 128);
+    test_gpu_contraction<ColMajor>(128, k, 128);
+    test_gpu_contraction<RowMajor>(128, k, 128);
   }
 }
 
 template<int DataLayout>
-void test_cuda_contraction_n() {
+void test_gpu_contraction_n() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(128, 128, k);
-    test_cuda_contraction<RowMajor>(128, 128, k);
+    test_gpu_contraction<ColMajor>(128, 128, k);
+    test_gpu_contraction<RowMajor>(128, 128, k);
   }
 }
 
 
 template<int DataLayout>
-void test_cuda_contraction_sizes() {
+void test_gpu_contraction_sizes() {
   int m_sizes[] = { 31,  39,   63,   64,   65,
                    127, 129,  255,  257 , 511,
                    512, 513, 1023, 1024, 1025};
@@ -185,29 +187,32 @@ void test_cuda_contraction_sizes() {
   for (int i = 0; i < 15; i++) {
     for (int j = 0; j < 15; j++) {
       for (int k = 0; k < 17; k++) {
-        test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+        test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
       }
     }
   }
 }
 
-void test_cxx11_tensor_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)
 {
-  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
-  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
 
   CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
   CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
 
-  CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+  CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
 
-  CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
-  CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+  CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
+  CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
 
-  CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
-  CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+  CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
+  CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
 
-  CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
-  CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// disable these subtests for HIP
+  CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
+  CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
+#endif	
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fbcc293580efa9da6baf192042c3a4ce50ff8cbd
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp
@@ -0,0 +1,1026 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <algorithm>
+#include <chrono>
+#include <ctime>
+#include <iostream>
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void static test_sycl_contraction(const Device &sycl_device, IndexType m_size,
+                                  IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, result_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_m(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
+                                                           128);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_k(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
+                                                           128);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_n(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
+                                                           128, k);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_sizes(const Device &sycl_device) {
+  IndexType m_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
+                         257, 511, 512, 513, 1023, 1024, 1025};
+
+  IndexType n_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
+                         257, 511, 512, 513, 1023, 1024, 1025};
+
+  IndexType k_sizes[] = {31,  39,  63,  64,  65,  95,   96,   127, 129,
+                         255, 257, 511, 512, 513, 1023, 1024, 1025};
+
+  for (IndexType i = 0; i < 15; i++) {
+    for (IndexType j = 0; j < 15; j++) {
+      for (IndexType k = 0; k < 17; k++) {
+        test_sycl_contraction<DataLayout, DataType, IndexType>(
+            sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
+      }
+    }
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size,
+                                  IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Allocate buffers twice as big to check for invalid read and write
+  auto padded_left_size = 2 * t_left.size();
+  auto padded_right_size = 2 * t_right.size();
+  auto padded_result_size = 2 * t_result.size();
+
+  std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
+  std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
+  std::size_t t_result_bytes = padded_result_size * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  // TensorMaps are still of the same size than the Tensors
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, result_dims);
+
+  // Write nan after the actual buffer to propagate nans everywhere in case of
+  // invalid reads
+  DataType nan = std::numeric_limits<DataType>::quiet_NaN();
+  auto host_left_data = new DataType[padded_left_size];
+  std::copy_n(t_left.data(), t_left.size(), host_left_data);
+  std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
+  auto host_right_data = new DataType[padded_right_size];
+  std::copy_n(t_right.data(), t_right.size(), host_right_data);
+  std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
+  auto host_result_data = new DataType[padded_result_size];
+  std::fill_n(host_result_data, padded_result_size, nan);
+
+  sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
+  sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - host_result_data[i]))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
+                                  error_threshold)) {
+      continue;
+    }
+    if (std::isnan(host_result_data[i])) {
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", invalid read detected at IndexType " << i << ": "
+                << t_result(i) << " vs " << host_result_data[i] << std::endl;
+    } else {
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", mismatch detected at IndexType " << i << ": "
+                << t_result(i) << " vs " << host_result_data[i] << std::endl;
+    }
+    VERIFY_IS_APPROX(host_result_data[i], t_result(i));
+  }
+  // Make sure that the rest of the result is still nans
+  for (IndexType i = t_result.size(); i < padded_result_size; i++) {
+    if (std::isnan(host_result_data[i])) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", invalid write detected at IndexType " << i << ": "
+              << host_result_data[i] << std::endl;
+    VERIFY_IS_APPROX(host_result_data[i], t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+
+  delete[] host_left_data;
+  delete[] host_right_data;
+  delete[] host_result_data;
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size,
+                 IndexType n_size) {
+  // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size <<
+  // ")" << std::endl;
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 0, DataLayout, IndexType> t_result;
+  Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>>
+      gpu_t_result(d_t_result);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+          t_result() - t_result_gpu()))) > error_threshold &&
+      !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
+    std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size
+              << " : mismatch detected: " << t_result() << " vs "
+              << t_result_gpu() << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(), t_result());
+  }
+
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_batch(const Device &sycl_device, IndexType m_size,
+                       IndexType k_size, IndexType n_size, IndexType m_batch,
+                       IndexType start, IndexType limit) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  typedef Eigen::array<IndexType, 3> TensorDim;
+  typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType;
+  TensorDim left_dims = {{m_batch, k_size, m_size}};
+  TensorDim right_dims = {{m_batch, n_size, k_size}};
+  TensorDim res_dims = {{m_batch, m_size, n_size}};
+  Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}};
+
+  TensorType t_left(left_dims);
+  TensorType t_right(right_dims);
+  TensorType t_result_gpu(res_dims);
+  TensorType t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+  for (int i = start; i < limit; ++i) {
+    auto x = gpu_t_left.template chip<0>(i);
+    auto y = gpu_t_right.template chip<0>(i);
+    auto z = gpu_t_result.template chip<0>(i);
+    z.device(sycl_device) = x.contract(y, contract_pairs);
+  }
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  for (int i = start; i < limit; ++i) {
+    auto x = t_left.template chip<0>(i);
+    auto y = t_right.template chip<0>(i);
+    auto z = t_result.template chip<0>(i);
+    z = x.contract(y, contract_pairs);
+  }
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size,
+                                IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType j = 0; j < m_size; j++) {
+    for (IndexType i = 0; i < n_size; i++) {
+      if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+              t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) {
+        continue;
+      }
+      if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i),
+                                    error_threshold)) {
+        continue;
+      }
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", mismatch detected at IndexType m: " << j << " n: " << i
+                << " CPU : " << t_result(j, i)
+                << " vs SYCL:" << t_result_gpu(j, i) << std::endl;
+      VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i));
+    }
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size,
+                                IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_both_transposed(const Device &sycl_device, IndexType m_size,
+                                 IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
+  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <typename Dev>
+void inline tensorOutofBound(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Test out of bound for Tensor-Tensor
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
+                                                       1024);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+                                                       4096);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
+                                                       2048);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
+                                                       1024);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
+                                                       784);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
+                                                       10);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
+                                                       513);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
+                                                       783);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
+                                                       784);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
+                                                       11);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor out of bound tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
+                                                       128);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
+                                                       128);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_m(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_n(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_k(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_sizes(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+template <typename Dev>
+void inline vectorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // VECTOR-VECTOR
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
+                                                       1025);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
+                                                       1025);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
+                                                       1024);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
+                                                       1024);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
+                                                       1023);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
+                                                       1023);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "contracted tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline vectorTensor(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Vector-Tensor
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
+                                                       1025);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
+                                                       1025);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
+                                                       1024);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
+                                                       1024);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
+                                                       1023);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
+                                                       1023);
+
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
+                                                       4097);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
+                                                       4097);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
+                                                       4096);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
+                                                       4096);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
+                                                       4095);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
+                                                       4095);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
+                                                       32);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Matrix-Vector
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+                                                       1);
+// If the GEMV disabled it will creates one kernel to calculate the contraction.
+// Therefore the acumuation of float number will overflow the precision
+// threshold for float and cause the test to fail. While it the GMV multiple
+// kernel will be created and each one run the overflow of accumutation breaks
+// among the kernels.
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
+                                                       1);
+#endif
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorScalar(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // SCALAR Contraction
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline skinnyTensor_row(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
+                                                       257);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
+                                                       256);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
+                                                       16);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
+                                                       17);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline skinnyTensor_col(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
+                                                       257);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
+                                                       256);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
+                                                       16);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
+                                                       17);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_batch_per_device(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
+                                                   0, 4);
+  contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
+                                                   0, 4);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_lhs_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
+                                                            8);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                            32);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
+                                                            64);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
+                                                            2048, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
+                                                            10, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
+                                                            1024, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
+                                                            4096, 1024);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_rhs_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
+                                                            16);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
+                                                            17);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                            32);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
+                                                            64);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
+                                                            1024, 1024);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
+                                                            1024, 4096);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
+                                                            1024, 2048);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
+                                                            1024, 784);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_both_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
+                                                             17);
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                             32);
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
+                                                             16, 64);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
+  for (const auto &device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(tensorOutofBound(sycl_device));
+    CALL_SUBTEST_2(tensorTensor(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_m(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_n(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_k(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
+    CALL_SUBTEST_3(vectorVector(sycl_device));
+    CALL_SUBTEST_4(vectorTensor(sycl_device));
+    CALL_SUBTEST_5(tensorVector(sycl_device));
+    CALL_SUBTEST_6(tensorScalar(sycl_device));
+    CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
+    CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
+    CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device));
+    CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device));
+    CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device));
+    CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_contraction.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_contraction.cpp
index ace97057fe4597417bad2b0f128104dddce85493..3b5c6a13c8777cf6a889e5b7c7c2a284c9c1b684 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_contraction.cpp
@@ -471,7 +471,8 @@ static void test_tensor_product()
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{{}});
+  Eigen::array<DimPair, 0> dims;
+  Tensor<float, 4, DataLayout> result = mat1.contract(mat2, dims);
 
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_EQUAL(result.dimension(1), 3);
@@ -510,36 +511,91 @@ static void test_const_inputs()
   VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
 }
 
-void test_cxx11_tensor_contraction()
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams&, Index, Index, Index num_rows,
+      Index num_cols) const {
+    for (int i = 0; i < num_rows; ++i) {
+      for (int j = 0; j < num_cols; ++j) {
+        output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+      }
+    }
+  }
+};
+
+template <int DataLayout>
+static void test_large_contraction_with_output_kernel() {
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in mat4 to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+  m_result = m_left * m_right;
+
+  for (std::ptrdiff_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
 {
-  CALL_SUBTEST(test_evals<ColMajor>());
-  CALL_SUBTEST(test_evals<RowMajor>());
-  CALL_SUBTEST(test_scalar<ColMajor>());
-  CALL_SUBTEST(test_scalar<RowMajor>());
-  CALL_SUBTEST(test_multidims<ColMajor>());
-  CALL_SUBTEST(test_multidims<RowMajor>());
-  CALL_SUBTEST(test_holes<ColMajor>());
-  CALL_SUBTEST(test_holes<RowMajor>());
-  CALL_SUBTEST(test_full_redux<ColMajor>());
-  CALL_SUBTEST(test_full_redux<RowMajor>());
-  CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
-  CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
-  CALL_SUBTEST(test_expr<ColMajor>());
-  CALL_SUBTEST(test_expr<RowMajor>());
-  CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
-  CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
-  CALL_SUBTEST(test_consistency<ColMajor>());
-  CALL_SUBTEST(test_consistency<RowMajor>());
-  CALL_SUBTEST(test_large_contraction<ColMajor>());
-  CALL_SUBTEST(test_large_contraction<RowMajor>());
-  CALL_SUBTEST(test_matrix_vector<ColMajor>());
-  CALL_SUBTEST(test_matrix_vector<RowMajor>());
-  CALL_SUBTEST(test_tensor_vector<ColMajor>());
-  CALL_SUBTEST(test_tensor_vector<RowMajor>());
-  CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
-  CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
-  CALL_SUBTEST(test_tensor_product<ColMajor>());
-  CALL_SUBTEST(test_tensor_product<RowMajor>());
-  CALL_SUBTEST(test_const_inputs<ColMajor>());
-  CALL_SUBTEST(test_const_inputs<RowMajor>());
+  CALL_SUBTEST_1(test_evals<ColMajor>());
+  CALL_SUBTEST_1(test_evals<RowMajor>());
+  CALL_SUBTEST_1(test_scalar<ColMajor>());
+  CALL_SUBTEST_1(test_scalar<RowMajor>());
+  CALL_SUBTEST_2(test_multidims<ColMajor>());
+  CALL_SUBTEST_2(test_multidims<RowMajor>());
+  CALL_SUBTEST_2(test_holes<ColMajor>());
+  CALL_SUBTEST_2(test_holes<RowMajor>());
+  CALL_SUBTEST_3(test_full_redux<ColMajor>());
+  CALL_SUBTEST_3(test_full_redux<RowMajor>());
+  CALL_SUBTEST_3(test_contraction_of_contraction<ColMajor>());
+  CALL_SUBTEST_3(test_contraction_of_contraction<RowMajor>());
+  CALL_SUBTEST_4(test_expr<ColMajor>());
+  CALL_SUBTEST_4(test_expr<RowMajor>());
+  CALL_SUBTEST_4(test_out_of_order_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_out_of_order_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_consistency<ColMajor>());
+  CALL_SUBTEST_5(test_consistency<RowMajor>());
+  CALL_SUBTEST_5(test_large_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_large_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_matrix_vector<ColMajor>());
+  CALL_SUBTEST_6(test_matrix_vector<RowMajor>());
+  CALL_SUBTEST_6(test_tensor_vector<ColMajor>());
+  CALL_SUBTEST_6(test_tensor_vector<RowMajor>());
+  CALL_SUBTEST_7(test_small_blocking_factors<ColMajor>());
+  CALL_SUBTEST_7(test_small_blocking_factors<RowMajor>());
+  CALL_SUBTEST_7(test_tensor_product<ColMajor>());
+  CALL_SUBTEST_7(test_tensor_product<RowMajor>());
+  CALL_SUBTEST_8(test_const_inputs<ColMajor>());
+  CALL_SUBTEST_8(test_const_inputs<RowMajor>());
+  CALL_SUBTEST_8(test_large_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_8(test_large_contraction_with_output_kernel<RowMajor>());
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_convolution.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_convolution.cpp
index e3d4675ebea33abbbfc969ed60d3448e8dd9ba00..c3688f678636543757052e76f034aab7701d64c4 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_convolution.cpp
@@ -25,7 +25,8 @@ static void test_evals()
 
   Tensor<float, 2, DataLayout> result(2,3);
   result.setZero();
-  Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}};
+  Eigen::array<Tensor<float, 2>::Index, 1> dims3;
+  dims3[0] = 0;
 
   typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator;
   Evaluator eval(input.convolve(kernel, dims3), DefaultDevice());
@@ -136,7 +137,7 @@ static void test_strides() {
                                input(12)*kernel(2)));
 }
 
-void test_cxx11_tensor_convolution()
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution)
 {
   CALL_SUBTEST(test_evals<ColMajor>());
   CALL_SUBTEST(test_evals<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3954c8a28c1f7ccdde61fcb4975752b16f016965
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp
@@ -0,0 +1,469 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <iomanip>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+static const float error_threshold =1e-4f;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=55;
+  IndexType outdim2=51;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 1> kernel_dims = {{4}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 1> dims3{{0}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=51;
+  IndexType outdim2=51;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 2> dims3{{0,1}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=51;
+  IndexType outdim2=49;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 3> dims3{{0,1,2}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_evals(const Eigen::SyclDevice& sycl_device)
+{
+  Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+  Eigen::array<IndexType, 1> kernel_dims = {{2}};
+  Eigen::array<IndexType, 2> result_dims = {{2, 3}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
+
+  Eigen::array<IndexType, 1> dims3{{0}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1));  // index 0
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1));  // index 2
+  VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1));  // index 4
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1));  // index 1
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1));  // index 3
+  VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1));  // index 5
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr(const Eigen::SyclDevice& sycl_device)
+{
+  Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+  Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
+  Eigen::array<IndexType, 2> result_dims = {{2, 2}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
+
+  input.setRandom();
+  kernel.setRandom();
+  Eigen::array<IndexType, 2> dims;
+  dims[0] = 0;
+  dims[1] = 1;
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
+                                input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
+                                input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
+                                input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
+                                input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_modes(const Eigen::SyclDevice& sycl_device){
+
+Eigen::array<IndexType, 1> input_dims = {{3}};
+Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+
+input.setRandom();
+kernel.setRandom();
+Eigen::array<IndexType, 1> dims;
+dims[0] = 0;
+
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+
+  Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<DataType, 1, DataLayout, IndexType> valid(1);
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t valid_bytes = valid.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_valid =  static_cast<DataType*>(sycl_device.allocate(valid_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
+
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<DataType, 1, DataLayout, IndexType> same(3);
+  std::size_t same_bytes = same.size() * sizeof(DataType);
+  DataType * d_same =  static_cast<DataType*>(sycl_device.allocate(same_bytes));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
+  gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
+
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);
+  VERIFY_IS_APPROX(same(1), 2.5f);
+  VERIFY_IS_APPROX(same(2), 4.0f);
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(2, 2);
+
+  Tensor<DataType, 1, DataLayout, IndexType> full(5);
+  std::size_t full_bytes = full.size() * sizeof(DataType);
+  DataType * d_full =  static_cast<DataType*>(sycl_device.allocate(full_bytes));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
+  gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
+
+  VERIFY_IS_EQUAL(full.dimension(0), 5);
+  VERIFY_IS_APPROX(full(0), 0.0f);
+  VERIFY_IS_APPROX(full(1), 1.0f);
+  VERIFY_IS_APPROX(full(2), 2.5f);
+  VERIFY_IS_APPROX(full(3), 4.0f);
+  VERIFY_IS_APPROX(full(4), 1.5f);
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_valid);
+  sycl_device.deallocate(d_same);
+  sycl_device.deallocate(d_full);
+
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strides(const Eigen::SyclDevice& sycl_device){
+
+  Eigen::array<IndexType, 1> input_dims = {{13}};
+  Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+  Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+  Tensor<DataType, 1, DataLayout, IndexType> result(2);
+
+  input.setRandom();
+  kernel.setRandom();
+  Eigen::array<IndexType, 1> dims;
+  dims[0] = 0;
+
+  Eigen::array<IndexType, 1> stride_of_3;
+  stride_of_3[0] = 3;
+  Eigen::array<IndexType, 1> stride_of_2;
+  stride_of_2[0] = 2;
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+                               input(6)*kernel(2)));
+  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+                               input(12)*kernel(2)));
+}
+
+template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
+  QueueInterface queueInterface(s);
+  auto sycl_device=Eigen::SyclDevice(&queueInterface);
+  test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
+  test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
+  test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
+  test_evals<float, ColMajor, int64_t>(sycl_device);
+  test_evals<float, RowMajor, int64_t>(sycl_device);
+  test_expr<float, ColMajor, int64_t>(sycl_device);
+  test_expr<float, RowMajor, int64_t>(sycl_device);
+  test_modes<float, ColMajor, int64_t>(sycl_device);
+  test_modes<float, RowMajor, int64_t>(sycl_device);
+  test_strides<float, ColMajor, int64_t>(sycl_device);
+  test_strides<float, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConvolutionPerDevice(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_custom_index.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_custom_index.cpp
index 4528cc176374e3ed2c0919bc03b0a25c8c9af3a2..b5dbc97bde43d4d2ca11666dbbcbff96c72fc898 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_custom_index.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_custom_index.cpp
@@ -88,7 +88,7 @@ static void test_sizes_as_index()
 }
 
 
-void test_cxx11_tensor_custom_index() {
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_index) {
   test_map_as_index<ColMajor>();
   test_map_as_index<RowMajor>();
   test_matrix_as_index<ColMajor>();
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_custom_op.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_custom_op.cpp
index 8baa477cc992b5215c0842a6e45892db51983397..875ea57d27550148e24af7eeb3cd114ada6c8eba 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_custom_op.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_custom_op.cpp
@@ -104,7 +104,7 @@ static void test_custom_binary_op()
 }
 
 
-void test_cxx11_tensor_custom_op()
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op)
 {
   CALL_SUBTEST(test_custom_unary_op());
   CALL_SUBTEST(test_custom_binary_op());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d947ead83c3e22f7d2c50ba71b677abeb35ef809
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
@@ -0,0 +1,170 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+template<typename TensorType>
+struct InsertZeros {
+  DSizes<DenseIndex, 2> dimensions(const TensorType& input) const {
+    DSizes<DenseIndex, 2> result;
+    result[0] = input.dimension(0) * 2;
+    result[1] = input.dimension(1) * 2;
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const TensorType& input, Output& output, const Device& device) const
+  {
+    array<DenseIndex, 2> strides;
+    strides[0] = 2;
+    strides[1] = 2;
+    output.stride(strides).device(device) = input;
+
+    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
+    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
+    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
+  }
+};
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 5;
+  Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}};
+
+  Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange);
+  Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange);
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType;
+  TensorType gpu_in1(gpu_in1_data, tensorRange);
+  TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+  in1.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(out.dimension(0), 6);
+  VERIFY_IS_EQUAL(out.dimension(1), 10);
+
+  for (int i = 0; i < 6; i+=2) {
+    for (int j = 0; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2));
+    }
+  }
+  for (int i = 1; i < 6; i+=2) {
+    for (int j = 1; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(out(i, j), 0);
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+sycl_device.deallocate(gpu_out_data);
+}
+
+template<typename TensorType>
+struct BatchMatMul {
+  DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const {
+    DSizes<DenseIndex, 3> result;
+    result[0] = input1.dimension(0);
+    result[1] = input2.dimension(1);
+    result[2] = input2.dimension(2);
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const TensorType& input1, const TensorType& input2,
+            Output& output, const Device& device) const
+  {
+    typedef typename TensorType::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    for (int64_t i = 0; i < output.dimension(2); ++i) {
+      output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims);
+    }
+  }
+};
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}};
+  Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}};
+  Eigen::array<IndexType, 3> tensorResultRange  = {{2, 7, 5}};
+
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange);
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType;
+  TensorType gpu_in1(gpu_in1_data, tensorRange1);
+  TensorType gpu_in2(gpu_in2_data, tensorRange2);
+  TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
+
+  gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 5; ++i) {
+    typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims);
+    TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i);
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(val(j, k), reference(j, k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(custom_op_perDevice<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_device.cu b/contrib/eigen/unsupported/test/cxx11_tensor_device.cu
index cbb43e210c30c5c2e83488e3a3daf3898f4a44df..c9f78d2d39adbf80f4393a715eb96d462938e25a 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_device.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_device.cu
@@ -9,13 +9,15 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 using Eigen::RowMajor;
 
@@ -65,22 +67,22 @@ struct CPUContext {
 // Context for evaluation on GPU
 struct GPUContext {
   GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
-    assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
+    assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
     float kernel_1d_val[] = {3.14f, 2.7f};
-    assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+    assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
 
-    assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
+    assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
     float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
-    assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+    assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
 
-    assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
+    assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
     float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
-    assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+    assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
   }
   ~GPUContext() {
-    assert(cudaFree(kernel_1d_) == cudaSuccess);
-    assert(cudaFree(kernel_2d_) == cudaSuccess);
-    assert(cudaFree(kernel_3d_) == cudaSuccess);
+    assert(gpuFree(kernel_1d_) == gpuSuccess);
+    assert(gpuFree(kernel_2d_) == gpuSuccess);
+    assert(gpuFree(kernel_3d_) == gpuSuccess);
   }
 
   const Eigen::GpuDevice& device() const { return gpu_device_; }
@@ -101,7 +103,7 @@ struct GPUContext {
   float* kernel_2d_;
   float* kernel_3d_;
 
-  Eigen::CudaStreamDevice stream_;
+  Eigen::GpuStreamDevice stream_;
   Eigen::GpuDevice gpu_device_;
 };
 
@@ -280,12 +282,12 @@ void test_gpu() {
   float* d_in1;
   float* d_in2;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_in2), in2_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
 
   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
@@ -293,7 +295,7 @@ void test_gpu() {
 
   GPUContext context(gpu_in1, gpu_in2, gpu_out);
   test_contextual_eval(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -303,7 +305,7 @@ void test_gpu() {
   }
 
   test_forced_contextual_eval(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -313,7 +315,7 @@ void test_gpu() {
   }
 
   test_compound_assignment(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -323,7 +325,7 @@ void test_gpu() {
   }
 
   test_contraction(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
       const float result = out(i,j,0);
@@ -338,8 +340,8 @@ void test_gpu() {
   }
 
   test_1d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -349,8 +351,8 @@ void test_gpu() {
   }
 
   test_2d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
@@ -362,9 +364,13 @@ void test_gpu() {
     }
   }
 
+#if !defined(EIGEN_USE_HIP)
+// disable this test on the HIP platform
+// 3D tensor convolutions seem to hang on the HIP platform
+
   test_3d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 39; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
@@ -377,10 +383,13 @@ void test_gpu() {
       }
     }
   }
+
+#endif
+ 
 }
 
 
-void test_cxx11_tensor_device()
+EIGEN_DECLARE_TEST(cxx11_tensor_device)
 {
   CALL_SUBTEST_1(test_cpu());
   CALL_SUBTEST_2(test_gpu());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
index 7f79753c5e794e3860510611e09679665c89c603..5095cb07802ba283a8c96d470daa9d860ebefd50 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -13,19 +13,65 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <stdint.h>
+#include <iostream>
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_memory(const Eigen::SyclDevice &sycl_device) {
+  std::cout << "Running on : "
+            << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
+            <<std::endl;
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
+  DataType* gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+  sycl_device.deallocate(gpu_in_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
+  VERIFY(sycl_device.ok());
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorDims = {{sizeDim1}};
+  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
+  sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
 
-void test_device_sycl(const Eigen::SyclDevice &sycl_device) {
-  std::cout <<"Helo from ComputeCpp: the requested device exists and the device name is : "
-    << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;;
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
+  out.device(sycl_device) = in / in.constant(0);
+
+  sycl_device.synchronize();
+  VERIFY(!sycl_device.ok());
+  sycl_device.deallocate(gpu_data);
+}
+
+template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
+  test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
+  /// this test throw an exception. enable it if you want to see the exception
+  //test_device_exceptions<DataType, RowMajor>(sycl_device);
+  /// this test throw an exception. enable it if you want to see the exception
+  //test_device_exceptions<DataType, ColMajor>(sycl_device);
 }
-void test_cxx11_tensor_device_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_device_sycl(sycl_device));
+
+EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+  }
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_dimension.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_dimension.cpp
index 16f168ed4487b730babd1a5b0d29a72e1e9d6e4c..ee416e14a0d975da5d68d20d2c605a66973d35e0 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_dimension.cpp
@@ -60,10 +60,29 @@ static void test_rank_zero()
   VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
 }
 
-void test_cxx11_tensor_dimension()
+static void test_index_type_promotion() {
+  Eigen::DSizes<int, 3> src0(1, 2, 3);
+  Eigen::array<int, 3> src1;
+  src1[0] = 4;
+  src1[1] = 5;
+  src1[2] = 6;
+
+  Eigen::DSizes<long, 3> dst0(src0);
+  Eigen::DSizes<long, 3> dst1(src1);
+
+  VERIFY_IS_EQUAL(dst0[0], 1L);
+  VERIFY_IS_EQUAL(dst0[1], 2L);
+  VERIFY_IS_EQUAL(dst0[2], 3L);
+  VERIFY_IS_EQUAL(dst1[0], 4L);
+  VERIFY_IS_EQUAL(dst1[1], 5L);
+  VERIFY_IS_EQUAL(dst1[2], 6L);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_dimension)
 {
   CALL_SUBTEST(test_dynamic_size());
   CALL_SUBTEST(test_fixed_size());
   CALL_SUBTEST(test_match());
   CALL_SUBTEST(test_rank_zero());
+  CALL_SUBTEST(test_index_type_promotion());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_empty.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_empty.cpp
index d7eea42d7399d49ac733221636a6f49f5afa140f..fd889c46c64c8a94d6cf5cec51500d820d1afdc5 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_empty.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_empty.cpp
@@ -33,7 +33,7 @@ static void test_empty_fixed_size_tensor()
 }
 
 
-void test_cxx11_tensor_empty()
+EIGEN_DECLARE_TEST(cxx11_tensor_empty)
 {
    CALL_SUBTEST(test_empty_tensor());
    CALL_SUBTEST(test_empty_fixed_size_tensor());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_executor.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_executor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..66b06e8ee91d42666a69ea35b0bfa19d528bab42
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_executor.cpp
@@ -0,0 +1,731 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TiledEvaluation;
+
+// A set of tests to verify that different TensorExecutor strategies yields the
+// same results for all the ops, supporting tiled evaluation.
+
+// Default assignment that does no use block evaluation or vectorization.
+// We assume that default coefficient evaluation is well tested and correct.
+template <typename Dst, typename Expr>
+static void DefaultAssign(Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor =
+      Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
+                                      /*Vectorizable=*/false,
+                                      /*Tiling=*/TiledEvaluation::Off>;
+
+  Executor::run(Assign(dst, expr), DefaultDevice());
+}
+
+// Assignment with specified device and tiling strategy.
+template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
+          typename Dst, typename Expr>
+static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor = Eigen::internal::TensorExecutor<const Assign, Device,
+                                                   Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+}
+
+template <int NumDims>
+static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+  array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(min_dim, max_dim);
+  }
+  return dims;
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_unary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> src(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  src.setRandom();
+  const auto expr = src.square();
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T square = src.coeff(i) * src.coeff(i);
+    VERIFY_IS_EQUAL(square, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_binary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> lhs(dims);
+  Tensor<T, NumDims, Options, Index> rhs(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T sum = lhs.coeff(i) + rhs.coeff(i);
+    VERIFY_IS_EQUAL(sum, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto broadcasts = RandomDims<NumDims>(1, 7);
+  const auto expr = src.broadcast(broadcasts);
+
+  // We assume that broadcasting on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_rvalue(Device d)
+{
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Layout, Index> src(dims);
+  src.setRandom();
+
+#define TEST_CHIPPING(CHIP_DIM)                                           \
+  if (NumDims > (CHIP_DIM)) {                                             \
+    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+    const auto expr = src.template chip<(CHIP_DIM)>(offset);              \
+                                                                          \
+    Tensor<T, NumDims - 1, Layout, Index> golden;                         \
+    golden = expr;                                                        \
+                                                                          \
+    Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());       \
+                                                                          \
+    using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;   \
+    using Executor = internal::TensorExecutor<const Assign, Device,       \
+                                              Vectorizable, Tiling>;      \
+                                                                          \
+    Executor::run(Assign(dst, expr), d);                                  \
+                                                                          \
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {            \
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                     \
+    }                                                                     \
+  }
+
+  TEST_CHIPPING(0)
+  TEST_CHIPPING(1)
+  TEST_CHIPPING(2)
+  TEST_CHIPPING(3)
+  TEST_CHIPPING(4)
+  TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_lvalue(Device d)
+{
+  auto dims = RandomDims<NumDims>(1, 10);
+
+#define TEST_CHIPPING(CHIP_DIM)                                             \
+  if (NumDims > (CHIP_DIM)) {                                               \
+    /* Generate random data that we'll assign to the chipped tensor dim. */ \
+    array<Index, NumDims - 1> src_dims;                                     \
+    for (int i = 0; i < NumDims - 1; ++i) {                                 \
+      int dim = i < (CHIP_DIM) ? i : i + 1;                                 \
+      src_dims[i] = dims[dim];                                              \
+    }                                                                       \
+                                                                            \
+    Tensor<T, NumDims - 1, Layout, Index> src(src_dims);                    \
+    src.setRandom();                                                        \
+                                                                            \
+    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);   \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> random(dims);                         \
+    random.setZero();                                                       \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> golden(dims);                         \
+    golden = random;                                                        \
+    golden.template chip<(CHIP_DIM)>(offset) = src;                         \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> dst(dims);                            \
+    dst = random;                                                           \
+    auto expr = dst.template chip<(CHIP_DIM)>(offset);                      \
+                                                                            \
+    using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;     \
+    using Executor = internal::TensorExecutor<const Assign, Device,         \
+                                              Vectorizable, Tiling>;        \
+                                                                            \
+    Executor::run(Assign(expr, src), d);                                    \
+                                                                            \
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {              \
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                       \
+    }                                                                       \
+  }
+
+  TEST_CHIPPING(0)
+  TEST_CHIPPING(1)
+  TEST_CHIPPING(2)
+  TEST_CHIPPING(3)
+  TEST_CHIPPING(4)
+  TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) {
+      shuffled_dims[i] = dims[shuffle[i]];
+    }
+
+    const auto expr = src.shuffle(shuffle);
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    DefaultAssign(golden, expr);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_lvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    auto golden_shuffle = golden.shuffle(shuffle);
+    DefaultAssign(golden_shuffle, src);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    auto dst_shuffle = dst.shuffle(shuffle);
+    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_reshape(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+
+  static constexpr int ReshapedDims = NumDims - 1;
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Multiple 0th dimension and then shuffle.
+  std::vector<Index> shuffle;
+  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+  DSizes<Index, ReshapedDims> reshaped_dims;
+  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
+  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
+
+  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
+
+  // Now reshape using configured tensor executor.
+  Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());
+
+  auto expr = src.reshape(reshaped_dims);
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_rvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Pick a random slice of src tensor.
+  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
+  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  Tensor<T, NumDims, Options, Index> golden =
+      src.slice(slice_start, slice_size);
+
+  // Now reshape using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  auto expr = src.slice(slice_start, slice_size);
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_lvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Pick a random slice of src tensor.
+  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  Tensor<T, NumDims, Options, Index> slice(slice_size);
+  slice.setRandom();
+
+  // Assign a slice using default executor.
+  Tensor<T, NumDims, Options, Index> golden = src;
+  golden.slice(slice_start, slice_size) = slice;
+
+  // And using configured execution strategy.
+  Tensor<T, NumDims, Options, Index> dst = src;
+  auto expr = dst.slice(slice_start, slice_size);
+
+  using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(expr, slice), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting_of_forced_eval(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto broadcasts = RandomDims<NumDims>(1, 7);
+  const auto expr = src.square().eval().broadcast(broadcasts);
+
+  // We assume that broadcasting on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template<typename T, int NumDims>
+struct DummyGenerator {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  T operator()(const array <Index, NumDims>& dims) const {
+    T result = static_cast<T>(0);
+    for (int i = 0; i < NumDims; ++i) {
+      result += static_cast<T>((i + 1) * dims[i]);
+    }
+    return result;
+  }
+};
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_generator_op(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(20, 30);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto expr = src.generate(DummyGenerator<T, NumDims>());
+
+  // We assume that generator on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+    internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
+  Tensor <T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Reverse half of the dimensions.
+  Eigen::array<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
+
+  const auto expr = src.reverse(reverse);
+
+  // We assume that reversing on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor <T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the reversing using configured tensor executor.
+  Tensor <T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+    internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_async_execute_unary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> src(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  src.setRandom();
+  const auto expr = src.square();
+
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using DoneCallback = decltype(on_done);
+  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
+                                                 Vectorizable, Tiling>;
+
+  Executor::runAsync(Assign(dst, expr), d, on_done);
+  done.Wait();
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T square = src.coeff(i) * src.coeff(i);
+    VERIFY_IS_EQUAL(square, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_async_execute_binary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> lhs(dims);
+  Tensor<T, NumDims, Options, Index> rhs(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using DoneCallback = decltype(on_done);
+  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
+                                                 Vectorizable, Tiling>;
+
+  Executor::runAsync(Assign(dst, expr), d, on_done);
+  done.Wait();
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T sum = lhs.coeff(i) + rhs.coeff(i);
+    VERIFY_IS_EQUAL(sum, dst.coeff(i));
+  }
+}
+
+#ifdef EIGEN_DONT_VECTORIZE
+#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
+#else
+#define VECTORIZABLE(VAL) VAL
+#endif
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                                 \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     ColMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  ColMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     RowMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  RowMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))
+
+// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
+#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
+  Eigen::DefaultDevice default_device;
+  // Default device is unused in ASYNC tests.
+  EIGEN_UNUSED_VARIABLE(default_device);
+
+  const auto num_threads = internal::random<int>(20, 24);
+  Eigen::ThreadPool tp(num_threads);
+  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
+
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);
+
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_expr.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_expr.cpp
index 77e24cb67f500f70ef86f444c4d296e53723fad2..169fc1898a6803a5ff7b617981e65068fd524ae8 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_expr.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_expr.cpp
@@ -7,6 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <numeric>
+
 #include "main.h"
 
 #include <Eigen/CXX11/Tensor>
@@ -193,26 +195,23 @@ static void test_constants()
 
 static void test_boolean()
 {
-  Tensor<int, 1> vec(6);
-  std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data());
+  const int kSize = 31;
+  Tensor<int, 1> vec(kSize);
+  std::iota(vec.data(), vec.data() + kSize, 0);
 
   // Test ||.
   Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
-  VERIFY_IS_EQUAL(bool1[0], true);
-  VERIFY_IS_EQUAL(bool1[1], false);
-  VERIFY_IS_EQUAL(bool1[2], false);
-  VERIFY_IS_EQUAL(bool1[3], false);
-  VERIFY_IS_EQUAL(bool1[4], false);
-  VERIFY_IS_EQUAL(bool1[5], true);
+  for (int i = 0; i < kSize; ++i) {
+    bool expected = i < 1 || i > 4;
+    VERIFY_IS_EQUAL(bool1[i], expected);
+  }
 
   // Test &&, including cast of operand vec.
   Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
-  VERIFY_IS_EQUAL(bool2[0], false);
-  VERIFY_IS_EQUAL(bool2[1], true);
-  VERIFY_IS_EQUAL(bool2[2], true);
-  VERIFY_IS_EQUAL(bool2[3], true);
-  VERIFY_IS_EQUAL(bool2[4], false);
-  VERIFY_IS_EQUAL(bool2[5], false);
+  for (int i = 0; i < kSize; ++i) {
+    bool expected = bool(i) && i < 4;
+    VERIFY_IS_EQUAL(bool2[i], expected);
+  }
 
   // Compilation tests:
   // Test Tensor<bool> against results of cast or comparison; verifies that
@@ -300,8 +299,152 @@ static void test_select()
   }
 }
 
+template <typename Scalar>
+void test_minmax_nan_propagation_templ() {
+  for (int size = 1; size < 17; ++size) {
+    const Scalar kNaN = std::numeric_limits<Scalar>::quiet_NaN();
+    const Scalar kInf = std::numeric_limits<Scalar>::infinity();
+    const Scalar kZero(0);
+    Tensor<Scalar, 1> vec_all_nan(size);
+    Tensor<Scalar, 1> vec_one_nan(size);
+    Tensor<Scalar, 1> vec_zero(size);
+    vec_all_nan.setConstant(kNaN);
+    vec_zero.setZero();
+    vec_one_nan.setZero();
+    vec_one_nan(size/2) = kNaN;
+
+    auto verify_all_nan = [&](const Tensor<Scalar, 1>& v) {
+      for (int i = 0; i < size; ++i) {
+        VERIFY((numext::isnan)(v(i)));
+      }
+    };
+
+    auto verify_all_zero = [&](const Tensor<Scalar, 1>& v) {
+      for (int i = 0; i < size; ++i) {
+        VERIFY_IS_EQUAL(v(i), Scalar(0));
+      }
+    };
+
+    // Test NaN propagating max.
+    // max(nan, nan) = nan
+    // max(nan, 0) = nan
+    // max(0, nan) = nan
+    // max(0, 0) = 0
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kNaN));
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_all_nan));
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kZero));
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_zero));
+    verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(kNaN));
+    verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(vec_all_nan));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(kZero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(vec_zero));
+
+    // Test number propagating max.
+    // max(nan, nan) = nan
+    // max(nan, 0) = 0
+    // max(0, nan) = 0
+    // max(0, 0) = 0
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(kNaN));
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_all_nan));
+    verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(kZero));
+    verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_zero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kNaN));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_all_nan));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kZero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_zero));
+
+    // Test NaN propagating min.
+    // min(nan, nan) = nan
+    // min(nan, 0) = nan
+    // min(0, nan) = nan
+    // min(0, 0) = 0
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kNaN));
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_all_nan));
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kZero));
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_zero));
+    verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(kNaN));
+    verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(vec_all_nan));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(kZero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(vec_zero));
+
+    // Test number propagating min.
+    // min(nan, nan) = nan
+    // min(nan, 0) = 0
+    // min(0, nan) = 0
+    // min(0, 0) = 0
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(kNaN));
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_all_nan));
+    verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(kZero));
+    verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_zero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kNaN));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_all_nan));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kZero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_zero));
+
+    // Test min and max reduction
+    Tensor<Scalar, 0> val;
+    val = vec_zero.minimum();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template minimum<PropagateNaN>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.maximum();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template maximum<PropagateNaN>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kZero);
+
+    // Test NaN propagation for tensor of all NaNs.
+    val = vec_all_nan.template minimum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_all_nan.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kInf);
+    val = vec_all_nan.template maximum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_all_nan.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), -kInf);
+
+    // Test NaN propagation for tensor with a single NaN.
+    val = vec_one_nan.template minimum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_one_nan.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), (size == 1 ? kInf : kZero));
+    val = vec_one_nan.template maximum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_one_nan.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), (size == 1 ? -kInf : kZero));
+  }
+}
+
+static void test_clip()
+{
+  Tensor<float, 1> vec(6);
+  vec(0) = 4.0;
+  vec(1) = 8.0;
+  vec(2) = 15.0;
+  vec(3) = 16.0;
+  vec(4) = 23.0;
+  vec(5) = 42.0;
+
+  float kMin = 20;
+  float kMax = 30;
+
+  Tensor<float, 1> vec_clipped(6);
+  vec_clipped = vec.clip(kMin, kMax);
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(vec_clipped(i), numext::mini(numext::maxi(vec(i), kMin), kMax));
+  }
+}
+
+static void test_minmax_nan_propagation()
+{
+  test_minmax_nan_propagation_templ<float>();
+  test_minmax_nan_propagation_templ<double>();
+}
 
-void test_cxx11_tensor_expr()
+EIGEN_DECLARE_TEST(cxx11_tensor_expr)
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
@@ -311,4 +454,11 @@ void test_cxx11_tensor_expr()
   CALL_SUBTEST(test_functors());
   CALL_SUBTEST(test_type_casting());
   CALL_SUBTEST(test_select());
+  CALL_SUBTEST(test_clip());
+
+// Nan propagation does currently not work like one would expect from std::max/std::min,
+// so we disable it for now
+#if !EIGEN_ARCH_ARM_OR_ARM64
+  CALL_SUBTEST(test_minmax_nan_propagation());
+#endif
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_fft.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_fft.cpp
index 2f14ebc622db82adf09adb71eeff8436dde2110a..2e1008eca6564d8e085a3364e8732079cc0cf753 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_fft.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_fft.cpp
@@ -224,7 +224,35 @@ static void test_fft_real_input_energy() {
   }
 }
 
-void test_cxx11_tensor_fft() {
+template <typename RealScalar>
+static void test_fft_non_power_of_2_round_trip(int exponent) {
+  int n = (1 << exponent) + 1;
+
+  Eigen::DSizes<ptrdiff_t, 1> dimensions;
+  dimensions[0] = n;
+  const DSizes<ptrdiff_t, 1> arr = dimensions;
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> input;
+
+  input.resize(arr);
+  input.setRandom();
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<RealScalar>, 1, ColMajor> forward =
+      input.template fft<BothParts, FFT_FORWARD>(fft);
+
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> output =
+      forward.template fft<RealPart, FFT_REVERSE>(fft);
+
+  for (int i = 0; i < n; ++i) {
+    RealScalar tol = test_precision<RealScalar>() *
+                     (std::abs(input[i]) + std::abs(output[i]) + 1);
+    VERIFY_IS_APPROX_OR_LESS_THAN(std::abs(input[i] - output[i]), tol);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_fft) {
     test_fft_complex_input_golden();
     test_fft_real_input_golden();
 
@@ -270,4 +298,7 @@ void test_cxx11_tensor_fft() {
     test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
     test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 4>();
     test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 4>();
+
+    test_fft_non_power_of_2_round_trip<float>(7);
+    test_fft_non_power_of_2_round_trip<double>(7);
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
index 4c660de653ec196d7dba67861b281037589e1ebd..456ce6beaa2c057f00a0b2b35458408c13088e51 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -21,7 +21,7 @@ static void test_0d()
   TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
   VERIFY_IS_EQUAL(scalar1.rank(), 0);
   VERIFY_IS_EQUAL(scalar1.size(), 1);
-  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
+  VERIFY_IS_EQUAL(internal::array_prod(scalar1.dimensions()), 1);
 
   scalar1() = 7.0;
   scalar2() = 13.0;
@@ -250,7 +250,7 @@ static void test_array()
   }
 }
 
-void test_cxx11_tensor_fixed_size()
+EIGEN_DECLARE_TEST(cxx11_tensor_fixed_size)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_forced_eval.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_forced_eval.cpp
index 45d7345e93b8fab871b20f4c12cfdab855bab8dc..a21a02bec76c6fb76b4b8dc5d90a346a00382022 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_forced_eval.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -61,7 +61,7 @@ static void test_const()
   Eigen::array<int, 2> bcast;
   bcast[0] = 3;
   bcast[1] = 1;
-  const TensorMap<Tensor<const float, 2> > input_tensor(input.data(), 3, 3);
+  const TensorMap<const Tensor<float, 2> > input_tensor(input.data(), 3, 3);
   Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
 
   for (int i = 0; i < 3; ++i) {
@@ -72,7 +72,7 @@ static void test_const()
 }
 
 
-void test_cxx11_tensor_forced_eval()
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval)
 {
   CALL_SUBTEST(test_simple());
   CALL_SUBTEST(test_const());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index 5690da7238617e884388811263b52910ac576a9e..a55a5ad8a4b3ccecf7f1feea6f350cb83e576716 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -13,44 +13,44 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
-
+template <typename DataType, int DataLayout, typename IndexType>
 void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
 
-  int sizeDim1 = 100;
-  int sizeDim2 = 200;
-  int sizeDim3 = 200;
-  Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
-  Eigen::Tensor<float, 3> in1(tensorRange);
-  Eigen::Tensor<float, 3> in2(tensorRange);
-  Eigen::Tensor<float, 3> out(tensorRange);
+  IndexType sizeDim1 = 100;
+  IndexType sizeDim2 = 20;
+  IndexType sizeDim3 = 20;
+  Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
 
-  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_out_data =  static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
 
-  in1 = in1.random() + in1.constant(10.0f);
-  in2 = in2.random() + in2.constant(10.0f);
+  in1 = in1.random() + in1.constant(static_cast<DataType>(10.0f));
+  in2 = in2.random() + in2.constant(static_cast<DataType>(10.0f));
 
   // creating TensorMap from tensor
-  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
-  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
-  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
-  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float));
-  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(float));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
   /// c=(a+b)*b
   gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i, j, k),
                          (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
       }
@@ -63,8 +63,15 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
 
 }
 
-void test_cxx11_tensor_forced_eval_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_forced_eval_sycl(sycl_device));
+template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
+    CALL_SUBTEST(tensorForced_evalperDevice<half>(device));
+  }
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_generator.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_generator.cpp
index dcb928714b1d66c2d0220705e01840ab8c730046..6dcf676bb24325bb2a2f684b9efec9f862cf10bd 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_generator.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_generator.cpp
@@ -42,11 +42,11 @@ struct Generator2D {
 template <int DataLayout>
 static void test_2D()
 {
-  Tensor<float, 2> matrix(5, 7);
+  Tensor<float, 2> matrix(512, 512);
   Tensor<float, 2> result = matrix.generate(Generator2D());
 
-  for (int i = 0; i < 5; ++i) {
-    for (int j = 0; j < 5; ++j) {
+  for (int i = 0; i < 512; ++i) {
+    for (int j = 0; j < 512; ++j) {
       VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
     }
   }
@@ -80,7 +80,7 @@ static void test_gaussian()
 }
 
 
-void test_cxx11_tensor_generator()
+EIGEN_DECLARE_TEST(cxx11_tensor_generator)
 {
   CALL_SUBTEST(test_1D<ColMajor>());
   CALL_SUBTEST(test_1D<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_generator_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_generator_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fb6e3d9d0a8982d6a1e22e6b4177c2f8a2ad208c
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_generator_sycl.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+static const float error_threshold =1e-8f;
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+struct Generator1D {
+  Generator1D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+    return coordinates[0];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_1D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 6;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout,IndexType> vec(tensorRange);
+  Tensor<DataType, 1, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =vec.size()*sizeof(DataType);
+  DataType* gpu_data_vec  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_vec(gpu_data_vec, tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_vec.generate(Generator1D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+
+struct Generator2D {
+  Generator2D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_2D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 7;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(Generator2D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < 5; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType rows = 32;
+  IndexType cols = 48;
+  array<DataType, 2> means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array<DataType, 2> std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator<DataType, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+  array<IndexType, 2> tensorRange = {{rows, cols}};
+  Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(gaussian_gen);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < rows; ++i) {
+    for (IndexType j = 0; j < cols; ++j) {
+      DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      DataType gaussian = expf(-g_rows - g_cols);
+      Eigen::internal::isApprox(result(i, j), gaussian, error_threshold);
+    }
+  }
+}
+
+template<typename DataType, typename dev_Selector> void sycl_generator_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_1D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_1D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_generator_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_generator_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_gpu.cu
similarity index 56%
rename from contrib/eigen/unsupported/test/cxx11_tensor_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_gpu.cu
index 9584a539fd587cdf42383b30f59a4b379532795b..137d0d5969fbd6e3b545db01f6e0d09e7f59ed72 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_gpu.cu
@@ -9,15 +9,19 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#define EIGEN_GPU_TEST_C99_MATH  EIGEN_HAS_CXX11
+
 using Eigen::Tensor;
 
-void test_cuda_nullary() {
+void test_gpu_nullary() {
   Tensor<float, 1, 0, int> in1(2);
   Tensor<float, 1, 0, int> in2(2);
   in1.setRandom();
@@ -27,12 +31,12 @@ void test_cuda_nullary() {
 
   float* d_in1;
   float* d_in2;
-  cudaMalloc((void**)(&d_in1), tensor_bytes);
-  cudaMalloc((void**)(&d_in2), tensor_bytes);
-  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  gpuMalloc((void**)(&d_in1), tensor_bytes);
+  gpuMalloc((void**)(&d_in2), tensor_bytes);
+  gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -46,23 +50,23 @@ void test_cuda_nullary() {
   Tensor<float, 1, 0, int> new1(2);
   Tensor<float, 1, 0, int> new2(2);
 
-  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
-  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
 
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 2; ++i) {
     VERIFY_IS_APPROX(new1(i), 3.14f);
     VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_in2);
+  gpuFree(d_in1);
+  gpuFree(d_in2);
 }
 
-void test_cuda_elementwise_small() {
+void test_gpu_elementwise_small() {
   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -76,14 +80,14 @@ void test_cuda_elementwise_small() {
   float* d_in1;
   float* d_in2;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_in2), in2_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
@@ -95,9 +99,9 @@ void test_cuda_elementwise_small() {
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 2; ++i) {
     VERIFY_IS_APPROX(
@@ -105,12 +109,12 @@ void test_cuda_elementwise_small() {
         in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_in2);
-  cudaFree(d_out);
+  gpuFree(d_in1);
+  gpuFree(d_in2);
+  gpuFree(d_out);
 }
 
-void test_cuda_elementwise()
+void test_gpu_elementwise()
 {
   Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
   Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
@@ -129,16 +133,16 @@ void test_cuda_elementwise()
   float* d_in2;
   float* d_in3;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_in2), in2_bytes);
-  cudaMalloc((void**)(&d_in3), in3_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_in3), in3_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
@@ -148,8 +152,8 @@ void test_cuda_elementwise()
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 53; ++j) {
@@ -159,13 +163,13 @@ void test_cuda_elementwise()
     }
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_in2);
-  cudaFree(d_in3);
-  cudaFree(d_out);
+  gpuFree(d_in1);
+  gpuFree(d_in2);
+  gpuFree(d_in3);
+  gpuFree(d_out);
 }
 
-void test_cuda_props() {
+void test_gpu_props() {
   Tensor<float, 1> in1(200);
   Tensor<bool, 1> out(200);
   in1.setRandom();
@@ -175,12 +179,12 @@ void test_cuda_props() {
 
   float* d_in1;
   bool* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
@@ -190,19 +194,19 @@ void test_cuda_props() {
 
   gpu_out.device(gpu_device) = (gpu_in1.isnan)();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 200; ++i) {
     VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_out);
+  gpuFree(d_in1);
+  gpuFree(d_out);
 }
 
-void test_cuda_reduction()
+void test_gpu_reduction()
 {
   Tensor<float, 4> in1(72,53,97,113);
   Tensor<float, 2> out(72,97);
@@ -213,12 +217,12 @@ void test_cuda_reduction()
 
   float* d_in1;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
@@ -230,8 +234,8 @@ void test_cuda_reduction()
 
   gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -246,12 +250,12 @@ void test_cuda_reduction()
     }
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_out);
+  gpuFree(d_in1);
+  gpuFree(d_out);
 }
 
 template<int DataLayout>
-void test_cuda_contraction()
+void test_gpu_contraction()
 {
   // with these dimensions, the output has 300 * 140 elements, which is
   // more than 30 * 1024, which is the number of threads in blocks on
@@ -271,14 +275,14 @@ void test_cuda_contraction()
   float* d_t_right;
   float* d_t_result;
 
-  cudaMalloc((void**)(&d_t_left), t_left_bytes);
-  cudaMalloc((void**)(&d_t_right), t_right_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
 
-  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
@@ -298,7 +302,7 @@ void test_cuda_contraction()
   m_result = m_left * m_right;
   gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
 
-  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
 
   for (DenseIndex i = 0; i < t_result.size(); i++) {
     if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
@@ -307,13 +311,13 @@ void test_cuda_contraction()
     }
   }
 
-  cudaFree(d_t_left);
-  cudaFree(d_t_right);
-  cudaFree(d_t_result);
+  gpuFree(d_t_left);
+  gpuFree(d_t_right);
+  gpuFree(d_t_result);
 }
 
 template<int DataLayout>
-void test_cuda_convolution_1d()
+void test_gpu_convolution_1d()
 {
   Tensor<float, 4, DataLayout> input(74,37,11,137);
   Tensor<float, 1, DataLayout> kernel(4);
@@ -328,14 +332,14 @@ void test_cuda_convolution_1d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
@@ -345,8 +349,8 @@ void test_cuda_convolution_1d()
   Eigen::array<Eigen::DenseIndex, 1> dims(1);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 74; ++i) {
     for (int j = 0; j < 34; ++j) {
@@ -361,12 +365,12 @@ void test_cuda_convolution_1d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
-void test_cuda_convolution_inner_dim_col_major_1d()
+void test_gpu_convolution_inner_dim_col_major_1d()
 {
   Tensor<float, 4, ColMajor> input(74,9,11,7);
   Tensor<float, 1, ColMajor> kernel(4);
@@ -381,14 +385,14 @@ void test_cuda_convolution_inner_dim_col_major_1d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
@@ -398,8 +402,8 @@ void test_cuda_convolution_inner_dim_col_major_1d()
   Eigen::array<Eigen::DenseIndex, 1> dims(0);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 71; ++i) {
     for (int j = 0; j < 9; ++j) {
@@ -414,12 +418,12 @@ void test_cuda_convolution_inner_dim_col_major_1d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
-void test_cuda_convolution_inner_dim_row_major_1d()
+void test_gpu_convolution_inner_dim_row_major_1d()
 {
   Tensor<float, 4, RowMajor> input(7,9,11,74);
   Tensor<float, 1, RowMajor> kernel(4);
@@ -434,14 +438,14 @@ void test_cuda_convolution_inner_dim_row_major_1d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
@@ -451,8 +455,8 @@ void test_cuda_convolution_inner_dim_row_major_1d()
   Eigen::array<Eigen::DenseIndex, 1> dims(3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 7; ++i) {
     for (int j = 0; j < 9; ++j) {
@@ -467,13 +471,13 @@ void test_cuda_convolution_inner_dim_row_major_1d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
 template<int DataLayout>
-void test_cuda_convolution_2d()
+void test_gpu_convolution_2d()
 {
   Tensor<float, 4, DataLayout> input(74,37,11,137);
   Tensor<float, 2, DataLayout> kernel(3,4);
@@ -488,14 +492,14 @@ void test_cuda_convolution_2d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
@@ -505,8 +509,8 @@ void test_cuda_convolution_2d()
   Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 74; ++i) {
     for (int j = 0; j < 35; ++j) {
@@ -531,13 +535,13 @@ void test_cuda_convolution_2d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
 template<int DataLayout>
-void test_cuda_convolution_3d()
+void test_gpu_convolution_3d()
 {
   Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
   Tensor<float, 3, DataLayout> kernel(3,4,2);
@@ -552,14 +556,14 @@ void test_cuda_convolution_3d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;    
+  Eigen::GpuStreamDevice stream;    
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
@@ -569,8 +573,8 @@ void test_cuda_convolution_3d()
   Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 74; ++i) {
     for (int j = 0; j < 35; ++j) {
@@ -609,14 +613,15 @@ void test_cuda_convolution_3d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
 
+#if EIGEN_GPU_TEST_C99_MATH
 template <typename Scalar>
-void test_cuda_lgamma(const Scalar stddev)
+void test_gpu_lgamma(const Scalar stddev)
 {
   Tensor<Scalar, 2> in(72,97);
   in.setRandom();
@@ -628,12 +633,12 @@ void test_cuda_lgamma(const Scalar stddev)
 
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -641,8 +646,8 @@ void test_cuda_lgamma(const Scalar stddev)
 
   gpu_out.device(gpu_device) = gpu_in.lgamma();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -650,12 +655,13 @@ void test_cuda_lgamma(const Scalar stddev)
     }
   }
 
-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
 }
+#endif
 
 template <typename Scalar>
-void test_cuda_digamma()
+void test_gpu_digamma()
 {
   Tensor<Scalar, 1> in(7);
   Tensor<Scalar, 1> out(7);
@@ -682,12 +688,12 @@ void test_cuda_digamma()
 
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
@@ -695,8 +701,8 @@ void test_cuda_digamma()
 
   gpu_out.device(gpu_device) = gpu_in.digamma();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 5; ++i) {
     VERIFY_IS_APPROX(out(i), expected_out(i));
@@ -705,12 +711,12 @@ void test_cuda_digamma()
     VERIFY_IS_EQUAL(out(i), expected_out(i));
   }
 
-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_zeta()
+void test_gpu_zeta()
 {
   Tensor<Scalar, 1> in_x(6);
   Tensor<Scalar, 1> in_q(6);
@@ -744,14 +750,14 @@ void test_cuda_zeta()
   Scalar* d_in_x;
   Scalar* d_in_q;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in_x), bytes);
-  cudaMalloc((void**)(&d_in_q), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_q), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);
   
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
@@ -760,8 +766,8 @@ void test_cuda_zeta()
 
   gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   VERIFY_IS_EQUAL(out(0), expected_out(0));
   VERIFY((std::isnan)(out(3)));
@@ -772,13 +778,13 @@ void test_cuda_zeta()
     }
   }
 
-  cudaFree(d_in_x);
-  cudaFree(d_in_q);
-  cudaFree(d_out);
+  gpuFree(d_in_x);
+  gpuFree(d_in_q);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_polygamma()
+void test_gpu_polygamma()
 {
   Tensor<Scalar, 1> in_x(7);
   Tensor<Scalar, 1> in_n(7);
@@ -815,14 +821,14 @@ void test_cuda_polygamma()
   Scalar* d_in_x;
   Scalar* d_in_n;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in_x), bytes);
-  cudaMalloc((void**)(&d_in_n), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_n), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);
   
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
@@ -831,20 +837,20 @@ void test_cuda_polygamma()
 
   gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 7; ++i) {
     VERIFY_IS_APPROX(out(i), expected_out(i));
   }
 
-  cudaFree(d_in_x);
-  cudaFree(d_in_n);
-  cudaFree(d_out);
+  gpuFree(d_in_x);
+  gpuFree(d_in_n);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_igamma()
+void test_gpu_igamma()
 {
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
@@ -880,14 +886,14 @@ void test_cuda_igamma()
   Scalar* d_a;
   Scalar* d_x;
   Scalar* d_out;
-  assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
-  assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
-  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+  assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
 
-  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
@@ -896,8 +902,8 @@ void test_cuda_igamma()
 
   gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
@@ -909,13 +915,13 @@ void test_cuda_igamma()
     }
   }
 
-  cudaFree(d_a);
-  cudaFree(d_x);
-  cudaFree(d_out);
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_igammac()
+void test_gpu_igammac()
 {
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
@@ -950,14 +956,14 @@ void test_cuda_igammac()
   Scalar* d_a;
   Scalar* d_x;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_a), bytes);
-  cudaMalloc((void**)(&d_x), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_a), bytes);
+  gpuMalloc((void**)(&d_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
@@ -966,8 +972,8 @@ void test_cuda_igammac()
 
   gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
@@ -979,13 +985,14 @@ void test_cuda_igammac()
     }
   }
 
-  cudaFree(d_a);
-  cudaFree(d_x);
-  cudaFree(d_out);
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
 }
 
+#if EIGEN_GPU_TEST_C99_MATH
 template <typename Scalar>
-void test_cuda_erf(const Scalar stddev)
+void test_gpu_erf(const Scalar stddev)
 {
   Tensor<Scalar, 2> in(72,97);
   in.setRandom();
@@ -997,12 +1004,12 @@ void test_cuda_erf(const Scalar stddev)
 
   Scalar* d_in;
   Scalar* d_out;
-  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
-  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+  assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
 
-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1010,8 +1017,8 @@ void test_cuda_erf(const Scalar stddev)
 
   gpu_out.device(gpu_device) = gpu_in.erf();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -1019,12 +1026,12 @@ void test_cuda_erf(const Scalar stddev)
     }
   }
 
-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_erfc(const Scalar stddev)
+void test_gpu_erfc(const Scalar stddev)
 {
   Tensor<Scalar, 2> in(72,97);
   in.setRandom();
@@ -1036,12 +1043,12 @@ void test_cuda_erfc(const Scalar stddev)
 
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1049,8 +1056,8 @@ void test_cuda_erfc(const Scalar stddev)
 
   gpu_out.device(gpu_device) = gpu_in.erfc();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -1058,12 +1065,73 @@ void test_cuda_erfc(const Scalar stddev)
     }
   }
 
-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+#endif
+template <typename Scalar>
+void test_gpu_ndtri()
+{
+  Tensor<Scalar, 1> in_x(8);
+  Tensor<Scalar, 1> out(8);
+  Tensor<Scalar, 1> expected_out(8);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(0.);
+  in_x(2) = Scalar(0.5);
+  in_x(3) = Scalar(0.2);
+  in_x(4) = Scalar(0.8);
+  in_x(5) = Scalar(0.9);
+  in_x(6) = Scalar(0.1);
+  in_x(7) = Scalar(0.99);
+  in_x(8) = Scalar(0.01);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
+  expected_out(2) = Scalar(0.0);
+  expected_out(3) = Scalar(-0.8416212335729142);
+  expected_out(4) = Scalar(0.8416212335729142);
+  expected_out(5) = Scalar(1.2815515655446004);
+  expected_out(6) = Scalar(-1.2815515655446004);
+  expected_out(7) = Scalar(2.3263478740408408);
+  expected_out(8) = Scalar(-2.3263478740408408);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+  gpu_out.device(gpu_device) = gpu_in_x.ndtri();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY((std::isnan)(out(3)));
+
+  for (int i = 1; i < 6; ++i) {
+    if (i != 3) {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_betainc()
+void test_gpu_betainc()
 {
   Tensor<Scalar, 1> in_x(125);
   Tensor<Scalar, 1> in_a(125);
@@ -1172,16 +1240,16 @@ void test_cuda_betainc()
   Scalar* d_in_a;
   Scalar* d_in_b;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in_x), bytes);
-  cudaMalloc((void**)(&d_in_a), bytes);
-  cudaMalloc((void**)(&d_in_b), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_a), bytes);
+  gpuMalloc((void**)(&d_in_b), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
@@ -1191,8 +1259,8 @@ void test_cuda_betainc()
 
   gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 1; i < 125; ++i) {
     if ((std::isnan)(expected_out(i))) {
@@ -1202,83 +1270,374 @@ void test_cuda_betainc()
     }
   }
 
-  cudaFree(d_in_x);
-  cudaFree(d_in_a);
-  cudaFree(d_in_b);
-  cudaFree(d_out);
+  gpuFree(d_in_x);
+  gpuFree(d_in_a);
+  gpuFree(d_in_b);
+  gpuFree(d_out);
 }
 
+template <typename Scalar>
+void test_gpu_i0e()
+{
+  Tensor<Scalar, 1> in_x(21);
+  Tensor<Scalar, 1> out(21);
+  Tensor<Scalar, 1> expected_out(21);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_x_array(21);
+  Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+  expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
+      0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+      0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+      0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+      0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+      0.0897803118848;
+
+  for (int i = 0; i < 21; ++i) {
+    in_x(i) = in_x_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+  gpu_out.device(gpu_device) = gpu_in.bessel_i0e();
 
-void test_cxx11_tensor_cuda()
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 21; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i1e()
 {
-  CALL_SUBTEST_1(test_cuda_nullary());
-  CALL_SUBTEST_1(test_cuda_elementwise_small());
-  CALL_SUBTEST_1(test_cuda_elementwise());
-  CALL_SUBTEST_1(test_cuda_props());
-  CALL_SUBTEST_1(test_cuda_reduction());
-  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
-  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
-  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
-  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
-
-#if __cplusplus > 199711L
+  Tensor<Scalar, 1> in_x(21);
+  Tensor<Scalar, 1> out(21);
+  Tensor<Scalar, 1> expected_out(21);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_x_array(21);
+  Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+  expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
+      -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+      -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+      0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+      0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+      0.0875062221833;
+
+  for (int i = 0; i < 21; ++i) {
+    in_x(i) = in_x_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+  gpu_out.device(gpu_device) = gpu_in.bessel_i1e();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 21; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_igamma_der_a()
+{
+  Tensor<Scalar, 1> in_x(30);
+  Tensor<Scalar, 1> in_a(30);
+  Tensor<Scalar, 1> out(30);
+  Tensor<Scalar, 1> expected_out(30);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_a_array(30);
+  Array<Scalar, 1, Dynamic> in_x_array(30);
+  Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+  // See special_functions.cpp for the Python code that generates the test data.
+
+  in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+      1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+      100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+  in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+  expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
+      -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
+      -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
+      -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
+      -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
+      -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
+      -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
+      -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+  for (int i = 0; i < 30; ++i) {
+    in_x(i) = in_x_array(i);
+    in_a(i) = in_a_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_a), bytes);
+  gpuMalloc((void**)(&d_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+  gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 30; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_gamma_sample_der_alpha()
+{
+  Tensor<Scalar, 1> in_alpha(30);
+  Tensor<Scalar, 1> in_sample(30);
+  Tensor<Scalar, 1> out(30);
+  Tensor<Scalar, 1> expected_out(30);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_alpha_array(30);
+  Array<Scalar, 1, Dynamic> in_sample_array(30);
+  Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+  // See special_functions.cpp for the Python code that generates the test data.
+
+  in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
+      1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
+      100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+  in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+  expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+      1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+      0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+      1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+      0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+      1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+      0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+      1.00106492525, 0.97734200649, 1.02198794179;
+
+  for (int i = 0; i < 30; ++i) {
+    in_alpha(i) = in_alpha_array(i);
+    in_sample(i) = in_sample_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_alpha.size() * sizeof(Scalar);
+
+  Scalar* d_alpha;
+  Scalar* d_sample;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_alpha), bytes);
+  gpuMalloc((void**)(&d_sample), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+  gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 30; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_alpha);
+  gpuFree(d_sample);
+  gpuFree(d_out);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
+{
+  CALL_SUBTEST_1(test_gpu_nullary());
+  CALL_SUBTEST_1(test_gpu_elementwise_small());
+  CALL_SUBTEST_1(test_gpu_elementwise());
+  CALL_SUBTEST_1(test_gpu_props());
+  CALL_SUBTEST_1(test_gpu_reduction());
+  CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
+  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
+  CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+// they hang..need to investigate and fix
+  CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
+#endif
+
+#if EIGEN_GPU_TEST_C99_MATH
   // std::erf, std::erfc, and so on where only added in c++11. We use them
   // as a golden reference to validate the results produced by Eigen. Therefore
   // we can only run these tests if we use a c++11 compiler.
-  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
-  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
-  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
-  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
-
-  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
-  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
-  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
-  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
-
-  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
-  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
-  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
-  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
-
-  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
-  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
-  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
-  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
-  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
-
-  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
-  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
-  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
-  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
-
-  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
-  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
-  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
-  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
-  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
-
-  CALL_SUBTEST_5(test_cuda_digamma<float>());
-  CALL_SUBTEST_5(test_cuda_digamma<double>());
-
-  CALL_SUBTEST_5(test_cuda_polygamma<float>());
-  CALL_SUBTEST_5(test_cuda_polygamma<double>());
-
-  CALL_SUBTEST_5(test_cuda_zeta<float>());
-  CALL_SUBTEST_5(test_cuda_zeta<double>());
-
-  CALL_SUBTEST_5(test_cuda_igamma<float>());
-  CALL_SUBTEST_5(test_cuda_igammac<float>());
-
-  CALL_SUBTEST_5(test_cuda_igamma<double>());
-  CALL_SUBTEST_5(test_cuda_igammac<double>());
-
-  CALL_SUBTEST_6(test_cuda_betainc<float>());
-  CALL_SUBTEST_6(test_cuda_betainc<double>());
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));
+
+  CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
+  // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
+  CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_erf<double>(0.001));
+
+  CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
+  // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));
+
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+
+  CALL_SUBTEST_5(test_gpu_ndtri<float>());
+  CALL_SUBTEST_5(test_gpu_ndtri<double>());
+
+  CALL_SUBTEST_5(test_gpu_digamma<float>());
+  CALL_SUBTEST_5(test_gpu_digamma<double>());
+
+  CALL_SUBTEST_5(test_gpu_polygamma<float>());
+  CALL_SUBTEST_5(test_gpu_polygamma<double>());
+
+  CALL_SUBTEST_5(test_gpu_zeta<float>());
+  CALL_SUBTEST_5(test_gpu_zeta<double>());
+#endif
+
+  CALL_SUBTEST_5(test_gpu_igamma<float>());
+  CALL_SUBTEST_5(test_gpu_igammac<float>());
+
+  CALL_SUBTEST_5(test_gpu_igamma<double>());
+  CALL_SUBTEST_5(test_gpu_igammac<double>());
+
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+  CALL_SUBTEST_6(test_gpu_betainc<float>());
+  CALL_SUBTEST_6(test_gpu_betainc<double>());
+
+  CALL_SUBTEST_6(test_gpu_i0e<float>());
+  CALL_SUBTEST_6(test_gpu_i0e<double>());
+
+  CALL_SUBTEST_6(test_gpu_i1e<float>());
+  CALL_SUBTEST_6(test_gpu_i1e<double>());
+
+  CALL_SUBTEST_6(test_gpu_i1e<float>());
+  CALL_SUBTEST_6(test_gpu_i1e<double>());
+
+  CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
+  CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());
+
+  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
+  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
+#endif
+
 #endif
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_ifft.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_ifft.cpp
index 5fd88fa6c6fd0f8f9a819b92eb039ed5c50c08af..c20edd9acf0f4926e66bb838384c3ef81957ac46 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_ifft.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_ifft.cpp
@@ -131,7 +131,7 @@ static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3)
   }
 }
 
-void test_cxx11_tensor_ifft() {
+EIGEN_DECLARE_TEST(cxx11_tensor_ifft) {
   CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4));
   CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16));
   CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32));
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_image_op_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..db1c0206e8adec8dec21af112a86987a259228ee
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_image_op_sycl.cpp
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 245;
+  IndexType sizeDim2 = 343;
+  IndexType sizeDim3 = 577;
+
+  array<IndexType, 3> input_range ={{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> slice_range ={{sizeDim1-1, sizeDim2, sizeDim3}};
+
+  Tensor<DataType, 3,DataLayout, IndexType> tensor1(input_range);
+  Tensor<DataType, 3,DataLayout, IndexType> tensor2(input_range);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor3(slice_range);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor3_cpu(slice_range);
+
+
+
+  typedef Eigen::DSizes<IndexType, 3> Index3;
+  Index3 strides1(1L,1L, 1L);
+  Index3 indicesStart1(1L, 0L, 0L);
+  Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3);
+
+  Index3 strides2(1L,1L, 1L);
+  Index3 indicesStart2(0L, 0L, 0L);
+  Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3);
+  Eigen::DSizes<IndexType, 3> sizes(sizeDim1-1,sizeDim2,sizeDim3);
+
+  tensor1.setRandom();
+  tensor2.setRandom();
+
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, input_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, input_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu3(gpu_data3, slice_range);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType));
+  gpu3.device(sycl_device)= gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes);
+  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
+
+  tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2);
+
+
+  for (IndexType i = 0; i <slice_range[0] ; ++i) {
+    for (IndexType j = 0; j < slice_range[1]; ++j) {
+      for (IndexType k = 0; k < slice_range[2]; ++k) {
+        VERIFY_IS_EQUAL(tensor3_cpu(i,j,k), tensor3(i,j,k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_image_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) { 
+   CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+   CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
+#endif
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_image_patch.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_image_patch.cpp
index 475c596517d5feb5ac96bd2aedbbe623900d6dac..862f1f7f0d4c6cfb53b889603441365cc68bc39b 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -405,6 +405,57 @@ void test_patch_padding_same()
   }
 }
 
+// Verifies that SAME padding, when computed as negative values, will be clipped
+// to zero.
+void test_patch_padding_same_negative_padding_clip_to_zero() {
+  int input_depth = 1;
+  int input_rows = 15;
+  int input_cols = 1;
+  int input_batches = 1;
+  int ksize = 1;  // Corresponds to the Rows and Cols for
+                  // tensor.extract_image_patches<>.
+  int row_stride = 5;
+  int col_stride = 1;
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(
+      ksize, ksize, row_stride, col_stride, 1, 1, PADDING_SAME);
+  // row padding will be computed as -2 originally and then be clipped to 0.
+  VERIFY_IS_EQUAL(result.coeff(0), 1.0f);
+  VERIFY_IS_EQUAL(result.coeff(1), 6.0f);
+  VERIFY_IS_EQUAL(result.coeff(2), 11.0f);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);    // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);          // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);          // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 3);              // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major =
+      tensor_row_major.extract_image_patches(ksize, ksize, row_stride,
+                                             col_stride, 1, 1, PADDING_SAME);
+  VERIFY_IS_EQUAL(result_row_major.coeff(0), 1.0f);
+  VERIFY_IS_EQUAL(result_row_major.coeff(1), 6.0f);
+  VERIFY_IS_EQUAL(result_row_major.coeff(2), 11.0f);
+
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+}
+
 void test_patch_no_extra_dim()
 {
   Tensor<float, 3> tensor(2,3,5);
@@ -746,7 +797,7 @@ void test_imagenet_patches()
   }
 }
 
-void test_cxx11_tensor_image_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch)
 {
   CALL_SUBTEST_1(test_simple_patch());
   CALL_SUBTEST_2(test_patch_no_extra_dim());
@@ -754,4 +805,5 @@ void test_cxx11_tensor_image_patch()
   CALL_SUBTEST_4(test_patch_padding_valid_same_value());
   CALL_SUBTEST_5(test_patch_padding_same());
   CALL_SUBTEST_6(test_imagenet_patches());
+  CALL_SUBTEST_7(test_patch_padding_same_negative_padding_clip_to_zero());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c1828a0ec8749efe1bf6ac069f091cd4b9083f0c
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
@@ -0,0 +1,1092 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 4> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  array<IndexType, 5> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+  gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7);
+
+  // Single pixel patch: RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2);
+
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    // ColMajor
+    if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+      std::cout << "Mismatch detected at index colmajor " << i << " : "
+           << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i]
+           << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index row major" << i << " : "
+           << tensor_row_major.data()[i] << " vs "
+           << single_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+                    single_patch_row_major.data()[i]);
+  }
+
+
+  // Entire image patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+  gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7);
+
+  // Entire image patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2);
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 3; ++r) {
+        for (IndexType c = 0; c < 5; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            for (IndexType b = 0; b < 7; ++b) {
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+                expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b);
+                expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d);
+              }
+              // ColMajor
+              if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (entire_image_patch_row_major(b, patchId, c, r, d) !=
+                  expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d),
+                              expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+  gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7);
+
+  // 2D patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+  gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+  IndexType stride = 1;
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 2; ++r) {
+        for (IndexType c = 0; c < 2; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            for (IndexType b = 0; b < 7; ++b) {
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r*stride + i - row_padding;
+              IndexType col_offset = c*stride + j - col_padding;
+              // ColMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+              }
+              if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major);
+
+              // RowMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+              }
+              if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_patch_row_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+  sycl_device.deallocate(gpu_data_twod_patch_col_major);
+  sycl_device.deallocate(gpu_data_twod_patch_row_major);
+
+}
+
+
+// Verifies VALID padding (no padding) with incrementing values.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 3;
+  IndexType input_rows = 3;
+  IndexType input_cols = 3;
+  IndexType input_batches = 1;
+  IndexType ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Initializes tensor with incrementing numbers.
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    tensor_col_major.data()[i] = i + 1;
+  }
+  // ColMajor
+  array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}};
+  Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+  gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 1);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r + i - row_padding;
+              IndexType col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_result_col_major);
+  sycl_device.deallocate(gpu_data_result_row_major);
+}
+
+// Verifies VALID padding (no padding) with the same value.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 1;
+  IndexType input_rows = 5;
+  IndexType input_cols = 5;
+  IndexType input_batches = 2;
+  IndexType ksize = 3;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+  // ColMajor
+
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+  gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f);
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}};
+  Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+  gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 4);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r + i - row_padding;
+              IndexType col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies SAME padding.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 3;
+  IndexType input_rows = 4;
+  IndexType input_cols = 2;
+  IndexType input_batches = 1;
+  IndexType ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+
+  // ColMajor
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Initializes tensor with incrementing numbers.
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    tensor_col_major.data()[i] = i + 1;
+  }
+
+array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}};
+Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 2);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r*stride + i - row_padding;
+              IndexType col_offset = c*stride + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+template <typename DataType, typename IndexType>
+static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+
+  // ColMajor
+  array<IndexType, 3> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 3, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  tensor_col_major.setRandom();
+  Tensor<DataType, 3, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0));
+
+
+  // Single pixel patch: ColMajor
+  array<IndexType, 4> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+  gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3);
+
+  // Single pixel patch: RowMajor
+  array<IndexType, 4> patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+  Tensor<DataType, 4, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1);
+
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    // ColMajor
+    if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor_col_major.data()[i] << " vs "
+           << single_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+                    single_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+  gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+
+  // Entire image patch: RowMajor
+patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+Tensor<DataType, 4, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+DataType* gpu_data_entire_image_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2);
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 3; ++r) {
+        for (IndexType c = 0; c < 5; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            DataType expected_col_major = 0.0f;
+            DataType expected_row_major = 0.0f;
+            if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+              expected_col_major = tensor_col_major(d, r-1+i, c-2+j);
+              expected_row_major = tensor_row_major(c-2+j, r-1+i, d);
+            }
+            // ColMajor
+            if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major);
+            // RowMajor
+            if (entire_image_patch_row_major(patchId, c, r, d) !=
+                expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d),
+                            expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+  gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+
+  // 2D patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+  Tensor<DataType, 4, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+  gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+  IndexType stride = 1;
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 2; ++r) {
+        for (IndexType c = 0; c < 2; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            DataType expected_col_major = 0.0f;
+            DataType expected_row_major = 0.0f;
+            IndexType row_offset = r*stride + i - row_padding;
+            IndexType col_offset = c*stride + j - col_padding;
+            // ColMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+              expected_col_major = tensor_col_major(d, row_offset, col_offset);
+            }
+            if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major);
+            // RowMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) {
+              expected_row_major = tensor_row_major(col_offset, row_offset, d);
+            }
+            if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_patch_row_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+  sycl_device.deallocate(gpu_data_twod_patch_col_major);
+  sycl_device.deallocate(gpu_data_twod_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  // Test the code on typical configurations used by the 'imagenet' benchmarks at
+  // https://github.com/soumith/convnet-benchmarks
+  // ColMajor
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 128;
+  IndexType sizeDim3 = 128;
+  IndexType sizeDim4 = 16;
+  array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> l_in_col_major(tensorColMajorRange);
+  l_in_col_major.setRandom();
+
+  DataType* gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+
+  array<IndexType, 5> patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> l_out_col_major(patchTensorRange);
+  size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange);
+  gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4);
+
+  // RowMajor
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> l_out_row_major(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1);
+
+  for (IndexType b = 0; b < 16; ++b) {
+    for (IndexType i = 0; i < 128; ++i) {
+      for (IndexType j = 0; j < 128; ++j) {
+        IndexType patchId = i+128*j;
+        for (IndexType c = 0; c < 11; ++c) {
+          for (IndexType r = 0; r < 11; ++r) {
+            for (IndexType d = 0; d < 3; ++d) {
+              DataType expected = 0.0f;
+              if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
+                expected = l_in_col_major(d, r-5+i, c-5+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) !=
+                  expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
+                              expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 16;
+  sizeDim2 = 64;
+  sizeDim3 = 64;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+// RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 64; ++i) {
+      for (IndexType j = 0; j < 64; ++j) {
+        IndexType patchId = i+64*j;
+        for (IndexType c = 0; c < 9; ++c) {
+          for (IndexType r = 0; r < 9; ++r) {
+            for (IndexType d = 0; d < 16; ++d) {
+              DataType expected = 0.0f;
+              if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
+                expected = l_in_col_major(d, r-4+i, c-4+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 32;
+  sizeDim2 = 16;
+  sizeDim3 = 16;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+  // RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 16; ++i) {
+      for (IndexType j = 0; j < 16; ++j) {
+        IndexType patchId = i+16*j;
+        for (IndexType c = 0; c < 7; ++c) {
+          for (IndexType r = 0; r < 7; ++r) {
+            for (IndexType d = 0; d < 32; ++d) {
+              DataType expected = 0.0f;
+              if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
+                expected = l_in_col_major(d, r-3+i, c-3+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 64;
+  sizeDim2 = 13;
+  sizeDim3 = 13;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+  // RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 13; ++i) {
+      for (IndexType j = 0; j < 13; ++j) {
+        IndexType patchId = i+13*j;
+        for (IndexType c = 0; c < 3; ++c) {
+          for (IndexType r = 0; r < 3; ++r) {
+            for (IndexType d = 0; d < 64; ++d) {
+              DataType expected = 0.0f;
+              if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
+                expected = l_in_col_major(d, r-1+i, c-1+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_image_patch_test_per_device(dev_Selector s){
+QueueInterface queueInterface(s);
+auto sycl_device = Eigen::SyclDevice(&queueInterface);
+test_simple_image_patch_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_valid_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_valid_same_value_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_same_sycl<DataType, int64_t>(sycl_device);
+test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device);
+test_imagenet_patches_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch_sycl)
+{
+for (const auto& device :Eigen::get_sycl_supported_devices()) {
+  CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device));
+}
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_index_list.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_index_list.cpp
index 4cf5df6667644824431060fa171b7810fca11de2..2166532c84e67fc4edfbdb36e4b8a7881aa31f58 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_index_list.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_index_list.cpp
@@ -22,9 +22,9 @@ static void test_static_index_list()
   VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
   VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
   VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2);
 
   EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -167,19 +167,18 @@ static void test_type2indexpair_list()
 
   typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0;
   typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a;
-  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b;
-  typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c;
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<Index>, Eigen::type2indexpair<2,12>> Dims2_b;
+  typedef Eigen::IndexPairList<Eigen::IndexPair<Index>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<Index>> Dims2_c;
 
-  Dims0 d0;
   Dims2_a d2_a;
 
   Dims2_b d2_b;
-  d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11));
+  d2_b.set(1, Eigen::IndexPair<Index>(1,11));
 
   Dims2_c d2_c;
-  d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10)));
-  d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11));  // setting type2indexpair to correct value.
-  d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12));
+  d2_c.set(0, Eigen::IndexPair<Index>(Eigen::IndexPair<Index>(0,10)));
+  d2_c.set(1, Eigen::IndexPair<Index>(1,11));  // setting type2indexpair to correct value.
+  d2_c.set(2, Eigen::IndexPair<Index>(2,12));
 
   VERIFY_IS_EQUAL(d2_a[0].first, 0);
   VERIFY_IS_EQUAL(d2_a[0].second, 10);
@@ -278,9 +277,9 @@ static void test_dynamic_index_list()
   VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2);
   VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
   VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 0);
 
   Tensor<float, 1> result = tensor.sum(reduction_axis);
   for (int i = 0; i < result.size(); ++i) {
@@ -310,10 +309,10 @@ static void test_mixed_index_list()
   VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
   VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
   VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[3]), 3);
 
   typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices;
   ReductionIndices reduction_indices;
@@ -373,7 +372,7 @@ static void test_dim_check()
 
 #endif
 
-void test_cxx11_tensor_index_list()
+EIGEN_DECLARE_TEST(cxx11_tensor_index_list)
 {
 #ifdef EIGEN_HAS_INDEX_LIST
   CALL_SUBTEST(test_static_index_list());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_inflation.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_inflation.cpp
index 4997935e97e472c3867ba56c9fc96593081d7760..75089e856fb42688411f0b162b1c321dcd3a1c9c 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_inflation.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_inflation.cpp
@@ -74,7 +74,7 @@ static void test_simple_inflation()
   }
 }
 
-void test_cxx11_tensor_inflation()
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation)
 {
   CALL_SUBTEST(test_simple_inflation<ColMajor>());
   CALL_SUBTEST(test_simple_inflation<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_inflation_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..521ae0cc3d31dbdf2f5d3e407d6d289ebb50c59b
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation Definition for each dimension the inflated val would be
+//((dim-1)*strid[dim] +1)
+
+// for 1 dimension vector of size 3 with value (4,4,4) with the inflated stride value of 3 would be changed to
+// tensor of size (2*3) +1 = 7 with the value of
+// (4, 0, 0, 4, 0, 0, 4).
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensorRange);
+  tensor.setRandom();
+
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_stride  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_stride.device(sycl_device)=gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  IndexType inflatedSizeDim1 = 3;
+  IndexType inflatedSizeDim2 = 9;
+  IndexType inflatedSizeDim3 = 9;
+  IndexType inflatedSizeDim4 = 19;
+  array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+  const size_t inflatedTensorBuffSize =inflated.size()*sizeof(DataType);
+  DataType* gpu_data_inflated  = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+  gpu_inflated.device(sycl_device)=gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+  VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+  VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+  VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+  for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+    for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+      for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+        for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+          if (i % strides[0] == 0 &&
+              j % strides[1] == 0 &&
+              k % strides[2] == 0 &&
+              l % strides[3] == 0) {
+            VERIFY_IS_EQUAL(inflated(i,j,k,l),
+                            tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3]));
+          } else {
+            VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_no_stride);
+  sycl_device.deallocate(gpu_data_inflated);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_intdiv.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_intdiv.cpp
index 8e2b70b75ee7acd5fa328e31e9c56a1222823fea..d18a05ec4c3f2d376f671aef3b8accd06aa0101d 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -135,7 +135,7 @@ void test_specific() {
   VERIFY_IS_EQUAL(result, result_op);
 }
 
-void test_cxx11_tensor_intdiv()
+EIGEN_DECLARE_TEST(cxx11_tensor_intdiv)
 {
   CALL_SUBTEST_1(test_signed_32bit());
   CALL_SUBTEST_2(test_unsigned_32bit());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_io.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_io.cpp
index 489960529a8b7dc1d725f6fcd677f044d683b308..2c638f9bf1b4fbcbd4da86e983469980db8b5055 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_io.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_io.cpp
@@ -119,7 +119,7 @@ static void test_output_const()
 }
 
 
-void test_cxx11_tensor_io()
+EIGEN_DECLARE_TEST(cxx11_tensor_io)
 {
   CALL_SUBTEST(test_output_0d<ColMajor>());
   CALL_SUBTEST(test_output_0d<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_layout_swap.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_layout_swap.cpp
index ae297a9da1169ac4dab3d10e9873187cb99c3366..efb33336073698b0311db3b4f8b207d8e87ba235 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_layout_swap.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_layout_swap.cpp
@@ -54,7 +54,7 @@ static void test_swap_as_lvalue()
 }
 
 
-void test_cxx11_tensor_layout_swap()
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap)
 {
   CALL_SUBTEST(test_simple_swap());
   CALL_SUBTEST(test_swap_as_lvalue());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9546b911caa00cd28392217807424caee8acf35b
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 7;
+  array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+
+  Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+  Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+
+//  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  //tensor.setRandom();
+
+//  Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, typename IndexType>
+static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 7;
+  array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+  Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+  Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  gpu2.swap_layout().device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+
+//  Tensor<float, 3, ColMajor> tensor(2,3,7);
+//  tensor.setRandom();
+
+  //Tensor<float, 3, RowMajor> tensor2(7,3,2);
+//  tensor2.swap_layout() = tensor;
+  VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_swap_sycl<DataType, int64_t>(sycl_device);
+  test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_lvalue.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_lvalue.cpp
index 071f5b4065871294179b20fc55e49dea2f53e300..6ba9a212d0be5fab13bc95c9ff199be9fa798c29 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_lvalue.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_lvalue.cpp
@@ -36,7 +36,7 @@ static void test_compound_assignment()
 }
 
 
-void test_cxx11_tensor_lvalue()
+EIGEN_DECLARE_TEST(cxx11_tensor_lvalue)
 {
   CALL_SUBTEST(test_compound_assignment());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_map.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_map.cpp
index 3db0ee7c061ad62bed2794de18833651f5ff0fcf..4d4f68911ef310dbd298c497c71053a345940df4 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_map.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_map.cpp
@@ -19,8 +19,8 @@ static void test_0d()
   Tensor<int, 0> scalar1;
   Tensor<int, 0, RowMajor> scalar2;
 
-  TensorMap<Tensor<const int, 0> > scalar3(scalar1.data());
-  TensorMap<Tensor<const int, 0, RowMajor> > scalar4(scalar2.data());
+  TensorMap<const Tensor<int, 0> > scalar3(scalar1.data());
+  TensorMap<const Tensor<int, 0, RowMajor> > scalar4(scalar2.data());
 
   scalar1() = 7;
   scalar2() = 13;
@@ -37,8 +37,8 @@ static void test_1d()
   Tensor<int, 1> vec1(6);
   Tensor<int, 1, RowMajor> vec2(6);
 
-  TensorMap<Tensor<const int, 1> > vec3(vec1.data(), 6);
-  TensorMap<Tensor<const int, 1, RowMajor> > vec4(vec2.data(), 6);
+  TensorMap<const Tensor<int, 1> > vec3(vec1.data(), 6);
+  TensorMap<const Tensor<int, 1, RowMajor> > vec4(vec2.data(), 6);
 
   vec1(0) = 4;  vec2(0) = 0;
   vec1(1) = 8;  vec2(1) = 1;
@@ -85,8 +85,8 @@ static void test_2d()
   mat2(1,1) = 4;
   mat2(1,2) = 5;
 
-  TensorMap<Tensor<const int, 2> > mat3(mat1.data(), 2, 3);
-  TensorMap<Tensor<const int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
+  TensorMap<const Tensor<int, 2> > mat3(mat1.data(), 2, 3);
+  TensorMap<const Tensor<int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
 
   VERIFY_IS_EQUAL(mat3.rank(), 2);
   VERIFY_IS_EQUAL(mat3.size(), 6);
@@ -129,8 +129,8 @@ static void test_3d()
     }
   }
 
-  TensorMap<Tensor<const int, 3> > mat3(mat1.data(), 2, 3, 7);
-  TensorMap<Tensor<const int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3> > mat3(mat1.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
 
   VERIFY_IS_EQUAL(mat3.rank(), 3);
   VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
@@ -265,7 +265,54 @@ static void test_casting()
   VERIFY_IS_EQUAL(sum1, 861);
 }
 
-void test_cxx11_tensor_map()
+template<typename T>
+static const T& add_const(T& value) {
+  return value;
+}
+
+static void test_0d_const_tensor()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  TensorMap<const Tensor<int, 0> > scalar3(add_const(scalar1).data());
+  TensorMap<const Tensor<int, 0, RowMajor> > scalar4(add_const(scalar2).data());
+
+  scalar1() = 7;
+  scalar2() = 13;
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar3(), 7);
+  VERIFY_IS_EQUAL(scalar4(), 13);
+}
+
+static void test_0d_const_tensor_map()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  const TensorMap<Tensor<int, 0> > scalar3(scalar1.data());
+  const TensorMap<Tensor<int, 0, RowMajor> > scalar4(scalar2.data());
+
+  // Although TensorMap is constant, we still can write to the underlying
+  // storage, because we map over non-constant Tensor.
+  scalar3() = 7;
+  scalar4() = 13;
+
+  VERIFY_IS_EQUAL(scalar1(), 7);
+  VERIFY_IS_EQUAL(scalar2(), 13);
+
+  // Pointer to the underlying storage is also non-const.
+  scalar3.data()[0] = 8;
+  scalar4.data()[0] = 14;
+
+  VERIFY_IS_EQUAL(scalar1(), 8);
+  VERIFY_IS_EQUAL(scalar2(), 14);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_map)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
@@ -274,4 +321,7 @@ void test_cxx11_tensor_map()
 
   CALL_SUBTEST(test_from_tensor());
   CALL_SUBTEST(test_casting());
+
+  CALL_SUBTEST(test_0d_const_tensor());
+  CALL_SUBTEST(test_0d_const_tensor_map());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_math.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_math.cpp
index 61c742a168f4819cd7dcfeab63fd8ce59434eb96..82a1a26d85e1f2c57c61cb4f8744e085d559d02a 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_math.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_math.cpp
@@ -39,7 +39,7 @@ static void test_sigmoid()
 }
 
 
-void test_cxx11_tensor_math()
+EIGEN_DECLARE_TEST(cxx11_tensor_math)
 {
   CALL_SUBTEST(test_tanh());
   CALL_SUBTEST(test_sigmoid());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_math_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_math_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..029653e271b95528cddf84c2222d6d4dab4e2694
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_math_sycl.cpp
@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  IndexType sizeDim1 = 4;
+  IndexType sizeDim2 = 4;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
+
+  in = in.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
+  gpu2.device(sycl_device) = gpu1.tanh();
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
+
+  out_cpu=in.tanh();
+
+  for (int i = 0; i < in.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), out_cpu(i));
+  }
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  IndexType sizeDim1 = 4;
+  IndexType sizeDim2 = 4;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
+
+  in = in.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
+  gpu2.device(sycl_device) = gpu1.sigmoid();
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
+
+  out_cpu=in.sigmoid();
+
+  for (int i = 0; i < in.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), out_cpu(i));
+  }
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_tanh_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_tanh_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_sigmoid_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_sigmoid_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp
index 4fba6fdd17b64279e2762de2e95f2d0a244f68a3..ee2616fd7f082117cb119a0d1bcb054b94b3be09 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -47,7 +47,7 @@ static void test_simple()
 }
 
 
-void test_cxx11_tensor_mixed_indices()
+EIGEN_DECLARE_TEST(cxx11_tensor_mixed_indices)
 {
   CALL_SUBTEST(test_simple());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_morphing.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_morphing.cpp
index f7de43110faa50cacd72ff213a9477bc4ac5aed9..ed5d5ade3926908106990104048fc6ad7436ee42 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_morphing.cpp
@@ -41,7 +41,29 @@ static void test_simple_reshape()
   }
 }
 
-template<typename>
+template <typename>
+static void test_static_reshape() {
+#if defined(EIGEN_HAS_INDEX_LIST)
+  using Eigen::type2index;
+
+  Tensor<float, 5> tensor(2, 3, 1, 7, 1);
+  tensor.setRandom();
+
+  // New dimensions: [2, 3, 7]
+  Eigen::IndexList<type2index<2>, type2index<3>, type2index<7>> dim;
+  Tensor<float, 3> reshaped = tensor.reshape(static_cast<Eigen::DSizes<ptrdiff_t,3>>(dim));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i, j, 0, k, 0), reshaped(i, j, k));
+      }
+    }
+  }
+#endif
+}
+
+template <typename>
 static void test_reshape_in_expr() {
   MatrixXf m1(2,3*5*7*11);
   MatrixXf m2(3*5*7*11,13);
@@ -90,19 +112,19 @@ static void test_reshape_as_lvalue()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_simple_slice()
 {
-  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
   tensor.setRandom();
 
-  Tensor<float, 5, DataLayout> slice1(1,1,1,1,1);
+  Tensor<T, 5, DataLayout> slice1(1,1,1,1,1);
   Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
   Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
   slice1 = tensor.slice(indices, sizes);
   VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
 
-  Tensor<float, 5, DataLayout> slice2(1,1,2,2,3);
+  Tensor<T, 5, DataLayout> slice2(1,1,2,2,3);
   Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
   Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
   slice2 = tensor.slice(indices2, sizes2);
@@ -115,20 +137,20 @@ static void test_simple_slice()
   }
 }
 
-template<typename=void>
+template<typename T>
 static void test_const_slice()
 {
-  const float b[1] = {42};
-  TensorMap<Tensor<const float, 1> > m(b, 1);
+  const T b[1] = {42};
+  TensorMap<Tensor<const T, 1> > m(b, 1);
   DSizes<DenseIndex, 1> offsets;
   offsets[0] = 0;
-  TensorRef<Tensor<const float, 1> > slice_ref(m.slice(offsets, m.dimensions()));
+  TensorRef<Tensor<const T, 1> > slice_ref(m.slice(offsets, m.dimensions()));
   VERIFY_IS_EQUAL(slice_ref(0), 42);
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_in_expr() {
-  typedef Matrix<float, Dynamic, Dynamic, DataLayout> Mtx;
+  typedef Matrix<T, Dynamic, Dynamic, DataLayout> Mtx;
   Mtx m1(7,7);
   Mtx m2(3,3);
   m1.setRandom();
@@ -136,10 +158,10 @@ static void test_slice_in_expr() {
 
   Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1);
 
-  TensorMap<Tensor<float, 2, DataLayout>> tensor1(m1.data(), 7, 7);
-  TensorMap<Tensor<float, 2, DataLayout>> tensor2(m2.data(), 3, 3);
-  Tensor<float, 2, DataLayout> tensor3(3,1);
-  typedef Tensor<float, 1>::DimensionPair DimPair;
+  TensorMap<Tensor<T, 2, DataLayout>> tensor1(m1.data(), 7, 7);
+  TensorMap<Tensor<T, 2, DataLayout>> tensor2(m2.data(), 3, 3);
+  Tensor<T, 2, DataLayout> tensor3(3,1);
+  typedef typename Tensor<T, 1>::DimensionPair DimPair;
   array<DimPair, 1> contract_along{{DimPair(1, 0)}};
 
   Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);
@@ -156,28 +178,28 @@ static void test_slice_in_expr() {
   }
 
   // Take an arbitrary slice of an arbitrarily sized tensor.
-  TensorMap<Tensor<const float, 2, DataLayout>> tensor4(m1.data(), 7, 7);
-  Tensor<float, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
+  TensorMap<Tensor<const T, 2, DataLayout>> tensor4(m1.data(), 7, 7);
+  Tensor<T, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
   for (int i = 0; i < 35; ++i) {
     VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i]));
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_as_lvalue()
 {
-  Tensor<float, 3, DataLayout> tensor1(2,2,7);
+  Tensor<T, 3, DataLayout> tensor1(2,2,7);
   tensor1.setRandom();
-  Tensor<float, 3, DataLayout> tensor2(2,2,7);
+  Tensor<T, 3, DataLayout> tensor2(2,2,7);
   tensor2.setRandom();
-  Tensor<float, 3, DataLayout> tensor3(4,3,5);
+  Tensor<T, 3, DataLayout> tensor3(4,3,5);
   tensor3.setRandom();
-  Tensor<float, 3, DataLayout> tensor4(4,3,2);
+  Tensor<T, 3, DataLayout> tensor4(4,3,2);
   tensor4.setRandom();
-  Tensor<float, 3, DataLayout> tensor5(10,13,12);
+  Tensor<T, 3, DataLayout> tensor5(10,13,12);
   tensor5.setRandom();
 
-  Tensor<float, 3, DataLayout> result(4,5,7);
+  Tensor<T, 3, DataLayout> result(4,5,7);
   Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7);
   Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0);
   result.slice(first_slice, sizes12) = tensor1;
@@ -223,10 +245,10 @@ static void test_slice_as_lvalue()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_raw_data()
 {
-  Tensor<float, 4, DataLayout> tensor(3,5,7,11);
+  Tensor<T, 4, DataLayout> tensor(3,5,7,11);
   tensor.setRandom();
 
   Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4);
@@ -253,7 +275,7 @@ static void test_slice_raw_data()
   extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);
   auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
   VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2);
-  VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0));
+  VERIFY_IS_EQUAL(slice3.data(), static_cast<T*>(0));
 
   if (DataLayout == ColMajor) {
     offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
@@ -318,15 +340,15 @@ static void test_slice_raw_data()
 }
 
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_strided_slice()
 {
-  typedef Tensor<float, 5, DataLayout> Tensor5f;
+  typedef Tensor<T, 5, DataLayout> Tensor5f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5;
-  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Tensor<T, 2, DataLayout> Tensor2f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
-  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
-  Tensor<float, 2, DataLayout> tensor2(7,11);
+  Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<T, 2, DataLayout> tensor2(7,11);
   tensor.setRandom();
   tensor2.setRandom();
 
@@ -412,13 +434,13 @@ static void test_strided_slice()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_strided_slice_write()
 {
-  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Tensor<T, 2, DataLayout> Tensor2f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
 
-  Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11);
+  Tensor<T, 2, DataLayout> tensor(7,11),tensor2(7,11);
   tensor.setRandom();
   tensor2=tensor;
   Tensor2f slice(2,3);
@@ -438,15 +460,14 @@ static void test_strided_slice_write()
   }
 }
 
-
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_composition()
 {
-  Eigen::Tensor<float, 2, DataLayout> matrix(7, 11);
+  Eigen::Tensor<T, 2, DataLayout> matrix(7, 11);
   matrix.setRandom();
 
   const DSizes<ptrdiff_t, 3> newDims(1, 1, 11);
-  Eigen::Tensor<float, 3, DataLayout> tensor =
+  Eigen::Tensor<T, 3, DataLayout> tensor =
       matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
 
   VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11);
@@ -458,28 +479,87 @@ static void test_composition()
   }
 }
 
+template<typename T, int DataLayout>
+static void test_empty_slice()
+{
+  Tensor<T, 3, DataLayout> tensor(2,3,5);
+  tensor.setRandom();
+  Tensor<T, 3, DataLayout> copy = tensor;
+
+  // empty size in first dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices1(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes1(0,1,2);
+  Tensor<T, 3, DataLayout> slice1(0,1,2);
+  slice1.setRandom();
+  tensor.slice(indices1, sizes1) = slice1;
+
+  // empty size in second dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices2(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes2(1,0,2);
+  Tensor<T, 3, DataLayout> slice2(1,0,2);
+  slice2.setRandom();
+  tensor.slice(indices2, sizes2) = slice2;
+
+  // empty size in third dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices3(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes3(1,1,0);
+  Tensor<T, 3, DataLayout> slice3(1,1,0);
+  slice3.setRandom();
+  tensor.slice(indices3, sizes3) = slice3;
+
+  // empty size in first and second dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices4(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes4(0,0,2);
+  Tensor<T, 3, DataLayout> slice4(0,0,2);
+  slice4.setRandom();
+  tensor.slice(indices4, sizes4) = slice4;
+
+  // empty size in second and third dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices5(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes5(1,0,0);
+  Tensor<T, 3, DataLayout> slice5(1,0,0);
+  slice5.setRandom();
+  tensor.slice(indices5, sizes5) = slice5;
+
+  // empty size in all dimensions
+  Eigen::DSizes<ptrdiff_t, 3> indices6(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes6(0,0,0);
+  Tensor<T, 3, DataLayout> slice6(0,0,0);
+  slice6.setRandom();
+  tensor.slice(indices6, sizes6) = slice6;
+
+  // none of these operations should change the tensor's components
+  // because all of the rvalue slices have at least one zero dimension
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+          VERIFY_IS_EQUAL(tensor(i,j,k), copy(i,j,k));
+      }
+    }
+  }
+}
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_TYPES_LAYOUTS(PART, NAME)       \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>()))
 
-void test_cxx11_tensor_morphing()
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing)
 {
   CALL_SUBTEST_1(test_simple_reshape<void>());
-  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_static_reshape<void>());
   CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
-
-  CALL_SUBTEST_1(test_simple_slice<ColMajor>());
-  CALL_SUBTEST_1(test_simple_slice<RowMajor>());
-  CALL_SUBTEST_1(test_const_slice());
-  CALL_SUBTEST_2(test_slice_in_expr<ColMajor>());
-  CALL_SUBTEST_3(test_slice_in_expr<RowMajor>());
-  CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>());
-  CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>());
-  CALL_SUBTEST_5(test_slice_raw_data<ColMajor>());
-  CALL_SUBTEST_5(test_slice_raw_data<RowMajor>());
-
-  CALL_SUBTEST_6(test_strided_slice_write<ColMajor>());
-  CALL_SUBTEST_6(test_strided_slice<ColMajor>());
-  CALL_SUBTEST_6(test_strided_slice_write<RowMajor>());
-  CALL_SUBTEST_6(test_strided_slice<RowMajor>());
-
-  CALL_SUBTEST_7(test_composition<ColMajor>());
-  CALL_SUBTEST_7(test_composition<RowMajor>());
+  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_const_slice<float>());
+
+  CALL_SUBTESTS_TYPES_LAYOUTS(2, test_simple_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(3, test_slice_as_lvalue);
+  CALL_SUBTESTS_TYPES_LAYOUTS(4, test_slice_raw_data);
+  CALL_SUBTESTS_TYPES_LAYOUTS(5, test_strided_slice_write);
+  CALL_SUBTESTS_TYPES_LAYOUTS(6, test_strided_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(7, test_composition);
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf001b40fb2906d1f2cbcf9b99ba181ad0203d10
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
+{
+  typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1);
+  typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7);
+  typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7);
+  typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21);
+
+  Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4);
+
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
+  DataType* gpu_data4  = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1);
+  TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2);
+  TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3);
+  TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+
+  gpu2.device(sycl_device)=gpu1.reshape(dim2);
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor1.size())*sizeof(DataType));
+
+  gpu3.device(sycl_device)=gpu1.reshape(dim3);
+  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
+
+  gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
+  sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType));
+  for (IndexType i = 0; i < 2; ++i){
+    for (IndexType j = 0; j < 3; ++j){
+      for (IndexType k = 0; k < 7; ++k){
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));      ///ColMajor
+        if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k));    ///ColMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k));    ///ColMajor
+        }
+        else{
+          //VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));      /// RowMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j*7 +k));   /// RowMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i*3 +j,k));   /// RowMajor
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+  sycl_device.deallocate(gpu_data4);
+}
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+  typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7);
+  typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7);
+  typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2);
+  Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3);
+
+  tensor.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
+
+  TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1);
+  TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2);
+  TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+
+  gpu2.reshape(dim1).device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2,(tensor2d.size())*sizeof(DataType));
+
+  gpu3.reshape(dim1).device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType));
+
+
+  for (IndexType i = 0; i < 2; ++i){
+    for (IndexType j = 0; j < 3; ++j){
+      for (IndexType k = 0; k < 7; ++k){
+        VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
+        if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
+          VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k));    ///ColMajor
+        }
+        else{
+          VERIFY_IS_EQUAL(tensor2d(i*3 +j,k),tensor(i,j,k));   /// RowMajor
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+  array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+  Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
+  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+
+  array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+  Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
+  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
+  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+  typedef Eigen::DSizes<IndexType, 5> Index5;
+  Index5 strides(1L,1L,1L,1L,1L);
+  Index5 indicesStart(1L,2L,3L,4L,5L);
+  Index5 indicesStop(2L,3L,4L,5L,6L);
+  Index5 lengths(1L,1L,1L,1L,1L);
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
+  Tensor<DataType, 5, DataLayout, IndexType> slice_stride1(slice1_range);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
+  DataType* gpu_data_stride2  = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride2(gpu_data_stride2, slice1_range);
+
+  Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
+  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
+
+  gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides);
+  sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+  VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
+  Tensor<DataType, 5, DataLayout, IndexType> strideSlice2(slice2_range);
+
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
+  DataType* gpu_data_stride3  = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride3(gpu_data_stride3, slice2_range);
+  Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
+  Index5 strides2(1L,1L,1L,1L,1L);
+  Index5 indicesStart2(1L,1L,3L,4L,5L);
+  Index5 indicesStop2(2L,2L,5L,6L,8L);
+
+  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
+  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
+
+  gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2);
+  sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+        VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f;
+  typedef Eigen::DSizes<IndexType, 2> Index2;
+  IndexType sizeDim1 = 7L;
+  IndexType sizeDim2 = 11L;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange);
+  IndexType sliceDim1 = 2;
+  IndexType sliceDim2 = 3;
+  array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}};
+  Tensor2f slice(sliceRange);
+  Index2 strides(1L,1L);
+  Index2 indicesStart(3L,4L);
+  Index2 indicesStop(5L,7L);
+  Index2 lengths(2L,3L);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu3(gpu_data3, sliceRange);
+
+
+  tensor.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1;
+
+  slice.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType));
+
+
+  gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3;
+  gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+  for(IndexType i=0;i<sizeDim1;i++)
+    for(IndexType j=0;j<sizeDim2;j++){
+    VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+template <typename OutIndex, typename DSizes>
+Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) {
+  Eigen::array<OutIndex, DSizes::count> out;
+  for (int i = 0; i < DSizes::count; ++i) {
+    out[i] = in[i];
+  }
+  return out;
+}
+
+template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType>
+int run_eigen(const SyclDevice& sycl_device) {
+  using TensorI64 = Tensor<DataType, 5, DataLayout, IndexType>;
+  using TensorI32 = Tensor<DataType, 5, DataLayout, ConvertedIndexType>;
+  using TensorMI64 = TensorMap<TensorI64>;
+  using TensorMI32 = TensorMap<TensorI32>;
+  Eigen::array<IndexType, 5> tensor_range{{4, 1, 1, 1, 6}};
+  Eigen::array<IndexType, 5> slice_range{{4, 1, 1, 1, 3}};
+
+  TensorI64 out_tensor_gpu(tensor_range);
+  TensorI64 out_tensor_cpu(tensor_range);
+  out_tensor_cpu.setRandom();
+
+  TensorI64 sub_tensor(slice_range);
+  sub_tensor.setRandom();
+
+  DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType)));
+  DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType)));
+  TensorMI64 out_gpu(out_gpu_data, tensor_range);
+  TensorMI64 sub_gpu(sub_gpu_data, slice_range);
+
+  sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType));
+
+  Eigen::array<ConvertedIndexType, 5> slice_offset_32{{0, 0, 0, 0, 3}};
+  Eigen::array<ConvertedIndexType, 5> slice_range_32{{4, 1, 1, 1, 3}};
+  TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions()));
+  TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions()));
+  TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions()));
+  TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions()));
+
+  out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32;
+
+  out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32;
+
+  sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType));
+  int has_err = 0;
+  for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) {
+    auto exp = out_tensor_cpu(i);
+    auto val = out_tensor_gpu(i);
+    if (val != exp) {
+      std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl;
+      has_err = 1;
+    }
+  }
+  sycl_device.deallocate(out_gpu_data);
+  sycl_device.deallocate(sub_gpu_data);
+  return has_err;
+}
+
+template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
+  test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
+  test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
+  test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  run_eigen<float, RowMajor, long, int>(sycl_device); 
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_move.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_move.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a2982319f49ed8d1c206906ff82bffcfa0d24697
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_move.cpp
@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Viktor Csomor <viktor.csomor@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+#include <utility>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void calc_indices(int i, int& x, int& y, int& z)
+{
+  x = i / 4;
+  y = (i % 4) / 2;
+  z = i % 2;
+}
+
+static void test_move()
+{
+  int x;
+  int y;
+  int z;
+
+  Tensor<int,3> tensor1(2, 2, 2);
+  Tensor<int,3,RowMajor> tensor2(2, 2, 2);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    tensor1(x,y,z) = i;
+    tensor2(x,y,z) = 2 * i;
+  }
+
+  // Invokes the move constructor.
+  Tensor<int,3> moved_tensor1 = std::move(tensor1);
+  Tensor<int,3,RowMajor> moved_tensor2 = std::move(tensor2);
+
+  VERIFY_IS_EQUAL(tensor1.size(), 0);
+  VERIFY_IS_EQUAL(tensor2.size(), 0);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    VERIFY_IS_EQUAL(moved_tensor1(x,y,z), i);
+    VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 2 * i);
+  }
+
+  Tensor<int,3> moved_tensor3(2,2,2);
+  Tensor<int,3,RowMajor> moved_tensor4(2,2,2);
+
+  moved_tensor3.setZero();
+  moved_tensor4.setZero();
+
+  // Invokes the move assignment operator.
+  moved_tensor3 = std::move(moved_tensor1);
+  moved_tensor4 = std::move(moved_tensor2);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    VERIFY_IS_EQUAL(moved_tensor3(x,y,z), i);
+    VERIFY_IS_EQUAL(moved_tensor4(x,y,z), 2 * i);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_move)
+{
+  CALL_SUBTEST(test_move());
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_notification.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_notification.cpp
index c946007b8ed80887e4033998aa9199c6d8a484e8..8e8165302d61f170a81060a8d002edd256cd09bc 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_notification.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_notification.cpp
@@ -9,38 +9,21 @@
 
 #define EIGEN_USE_THREADS
 
+#include <atomic>
+
 #include <stdlib.h>
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#include <windows.h>
-void sleep(int seconds) {
-  Sleep(seconds*1000);
-}
-#else
-#include <unistd.h>
-#endif
-
-
-namespace {
-
-void WaitAndAdd(Eigen::Notification* n, int* counter) {
-  n->Wait();
-  *counter = *counter + 1;
-}
-
-}  // namespace
-
 static void test_notification_single()
 {
   ThreadPool thread_pool(1);
 
-  int counter = 0;
+  std::atomic<int> counter(0);
   Eigen::Notification n;
-  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  auto func = [&n, &counter](){ n.Wait(); ++counter;};
   thread_pool.Schedule(func);
-  sleep(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
 
   // The thread should be waiting for the notification.
   VERIFY_IS_EQUAL(counter, 0);
@@ -48,7 +31,7 @@ static void test_notification_single()
   // Unblock the thread
   n.Notify();
 
-  sleep(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
 
   // Verify the counter has been incremented
   VERIFY_IS_EQUAL(counter, 1);
@@ -60,21 +43,21 @@ static void test_notification_multiple()
 {
   ThreadPool thread_pool(1);
 
-  int counter = 0;
+  std::atomic<int> counter(0);
   Eigen::Notification n;
-  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  auto func = [&n, &counter](){ n.Wait(); ++counter;};
   thread_pool.Schedule(func);
   thread_pool.Schedule(func);
   thread_pool.Schedule(func);
   thread_pool.Schedule(func);
-  sleep(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
   VERIFY_IS_EQUAL(counter, 0);
   n.Notify();
-  sleep(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
   VERIFY_IS_EQUAL(counter, 4);
 }
 
-void test_cxx11_tensor_notification()
+EIGEN_DECLARE_TEST(cxx11_tensor_notification)
 {
   CALL_SUBTEST(test_notification_single());
   CALL_SUBTEST(test_notification_multiple());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_of_complex.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_of_complex.cpp
index e9d1b2d3c514f0a8f7adda38a932371eed139f00..99e18076ac935efb6d66533f6b1e36f95ba6b396 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_of_complex.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -94,7 +94,7 @@ static void test_contractions()
 }
 
 
-void test_cxx11_tensor_of_complex()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_complex)
 {
   CALL_SUBTEST(test_additions());
   CALL_SUBTEST(test_abs());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_of_const_values.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_of_const_values.cpp
index f179a0c2183792d169fcd02ba7d8bc0b6c7a9a88..344d678ef9f76c8c811672cd1cf7a40e91b80c59 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_of_const_values.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_of_const_values.cpp
@@ -97,7 +97,7 @@ static void test_plus_equal()
 }
 
 
-void test_cxx11_tensor_of_const_values()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_const_values)
 {
   CALL_SUBTEST(test_assign());
   CALL_SUBTEST(test_plus());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_of_float16_gpu.cu
similarity index 84%
rename from contrib/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index e296bf99155019284caf5941aec92ee9da934727..30bcc1d28a535621d04250fd4feb6a8f442cad76 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -9,18 +9,19 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+
 using Eigen::Tensor;
 
 template<typename>
-void test_cuda_numext() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_numext() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -56,14 +57,14 @@ void test_cuda_numext() {
 }
 
 
-#ifdef EIGEN_HAS_CUDA_FP16
+#ifdef EIGEN_HAS_GPU_FP16
 
 template<typename>
-void test_cuda_conversion() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
-  
+
   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
   Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
   float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
@@ -94,8 +95,8 @@ void test_cuda_conversion() {
 }
 
 template<typename>
-void test_cuda_unary() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_unary() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -131,8 +132,8 @@ void test_cuda_unary() {
 }
 
 template<typename>
-void test_cuda_elementwise() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_elementwise() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -173,8 +174,8 @@ void test_cuda_elementwise() {
 }
 
 template<typename>
-void test_cuda_trancendental() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_trancendental() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -197,6 +198,8 @@ void test_cuda_trancendental() {
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
 
   gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
   gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
@@ -204,6 +207,7 @@ void test_cuda_trancendental() {
   gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
   gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
   gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+  gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
 
   gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
   gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
@@ -214,6 +218,9 @@ void test_cuda_trancendental() {
   gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
   gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
 
+  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
+
   Tensor<float, 1> input1(num_elem);
   Tensor<Eigen::half, 1> half_prec1(num_elem);
   Tensor<Eigen::half, 1> full_prec1(num_elem);
@@ -240,7 +247,7 @@ void test_cuda_trancendental() {
   }
   for (int i = 0; i < num_elem; ++i) {
     std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
-    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
       VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
     else
       VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
@@ -261,8 +268,8 @@ void test_cuda_trancendental() {
 }
 
 template<typename>
-void test_cuda_contractions() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_contractions() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int rows = 23;
   int cols = 23;
@@ -312,36 +319,32 @@ void test_cuda_contractions() {
 }
 
 template<typename>
-void test_cuda_reductions(int size1, int size2, int redux) {
+void test_gpu_reductions(int size1, int size2, int redux) {
 
    std::cout << "Reducing " << size1 << " by " << size2
-             << " tensor along dim " << redux << std::endl; 
+             << " tensor along dim " << redux << std::endl;
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = size1*size2;
   int result_size = (redux == 1 ? size1 : size2);
 
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
 
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
-      d_float1, size1, size2);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
-      d_float2, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size1, size2);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
       d_res_half, result_size);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
       d_res_float, result_size);
 
-  gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
-  gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
+  gpu_float.device(gpu_device) = gpu_float.random() * 2.0f;
 
-  Eigen::array<int, 1> redux_dim = {{redux}};
-  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
+  Eigen::array<int, 1> redux_dim = {redux};
+  gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
 
   Tensor<Eigen::half, 1> half_prec(result_size);
   Tensor<Eigen::half, 1> full_prec(result_size);
@@ -354,50 +357,45 @@ void test_cuda_reductions(int size1, int size2, int redux) {
     VERIFY_IS_APPROX(full_prec(i), half_prec(i));
   }
 
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
 }
 
 template<typename>
-void test_cuda_reductions() {
-  test_cuda_reductions<void>(13, 13, 0);
-  test_cuda_reductions<void>(13, 13, 1);
+void test_gpu_reductions() {
+  test_gpu_reductions<void>(13, 13, 0);
+  test_gpu_reductions<void>(13, 13, 1);
 
-  test_cuda_reductions<void>(35, 36, 0);
-  test_cuda_reductions<void>(35, 36, 1);
+  test_gpu_reductions<void>(35, 36, 0);
+  test_gpu_reductions<void>(35, 36, 1);
 
-  test_cuda_reductions<void>(36, 35, 0);
-  test_cuda_reductions<void>(36, 35, 1);
+  test_gpu_reductions<void>(36, 35, 0);
+  test_gpu_reductions<void>(36, 35, 1);
 }
 
 template<typename>
-void test_cuda_full_reductions() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_full_reductions() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int size = 13;
   int num_elem = size*size;
 
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
 
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
-      d_float1, size, size);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
-      d_float2, size, size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size, size);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
       d_res_half);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
       d_res_float);
 
-  gpu_float1.device(gpu_device) = gpu_float1.random();
-  gpu_float2.device(gpu_device) = gpu_float2.random();
+  gpu_float.device(gpu_device) = gpu_float.random();
 
-  gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+  gpu_res_float.device(gpu_device) = gpu_float.sum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum();
 
   Tensor<Eigen::half, 0> half_prec;
   Tensor<Eigen::half, 0> full_prec;
@@ -407,24 +405,23 @@ void test_cuda_full_reductions() {
 
   VERIFY_IS_APPROX(full_prec(), half_prec());
 
-  gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
+  gpu_res_float.device(gpu_device) = gpu_float.maximum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().maximum();
   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
   gpu_device.synchronize();
 
   VERIFY_IS_APPROX(full_prec(), half_prec());
 
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
 }
 
 template<typename>
-void test_cuda_forced_evals() {
+void test_gpu_forced_evals() {
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -437,7 +434,7 @@ void test_cuda_forced_evals() {
       d_float, num_elem);
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
       d_res_half1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
       d_res_half2, num_elem);
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
       d_res_float, num_elem);
@@ -454,7 +451,7 @@ void test_cuda_forced_evals() {
   Tensor<float, 1> half_prec2(num_elem);
   Tensor<float, 1> full_prec(num_elem);
   gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
   gpu_device.synchronize();
 
@@ -472,20 +469,20 @@ void test_cuda_forced_evals() {
 #endif
 
 
-void test_cxx11_tensor_of_float16_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu)
 {
-  CALL_SUBTEST_1(test_cuda_numext<void>());
-
-#ifdef EIGEN_HAS_CUDA_FP16
-  CALL_SUBTEST_1(test_cuda_conversion<void>());
-  CALL_SUBTEST_1(test_cuda_unary<void>());
-  CALL_SUBTEST_1(test_cuda_elementwise<void>());
-  CALL_SUBTEST_1(test_cuda_trancendental<void>());
-  CALL_SUBTEST_2(test_cuda_contractions<void>());
-  CALL_SUBTEST_3(test_cuda_reductions<void>());
-  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
-  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
+  CALL_SUBTEST_1(test_gpu_numext<void>());
+
+#ifdef EIGEN_HAS_GPU_FP16
+  CALL_SUBTEST_1(test_gpu_conversion<void>());
+  CALL_SUBTEST_1(test_gpu_unary<void>());
+  CALL_SUBTEST_1(test_gpu_elementwise<void>());
+  CALL_SUBTEST_1(test_gpu_trancendental<void>());
+  CALL_SUBTEST_2(test_gpu_contractions<void>());
+  CALL_SUBTEST_3(test_gpu_reductions<void>());
+  CALL_SUBTEST_4(test_gpu_full_reductions<void>());
+  CALL_SUBTEST_5(test_gpu_forced_evals<void>());
 #else
-  std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
+  std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
 #endif
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_of_strings.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_of_strings.cpp
index 4ef9aed91a5f222f7571218823ef4b1659bd4806..159656276e6797ab1f38f55af3cdbff752b2878e 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_of_strings.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -141,7 +141,7 @@ static void test_initialization()
 }
 
 
-void test_cxx11_tensor_of_strings()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_strings)
 {
   // Beware: none of this is likely to ever work on a GPU.
   CALL_SUBTEST(test_assign());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_padding.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_padding.cpp
index ffa19896e1368e2be641e236cbd39150a219f2c1..b8a329debcc4b96b2cd8f6fb9f4a77bfc65f2fc5 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_padding.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_padding.cpp
@@ -84,7 +84,7 @@ static void test_padded_expr()
   }
 }
 
-void test_cxx11_tensor_padding()
+EIGEN_DECLARE_TEST(cxx11_tensor_padding)
 {
   CALL_SUBTEST(test_simple_padding<ColMajor>());
   CALL_SUBTEST(test_simple_padding<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..727a9ffd72998c0a7b41f2d1c744329bff4aa44c
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -0,0 +1,157 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<std::pair<IndexType, IndexType>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  IndexType padedSizeDim1 = 2;
+  IndexType padedSizeDim2 = 6;
+  IndexType padedSizeDim3 = 12;
+  IndexType padedSizeDim4 = 7;
+  array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange);
+
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu2(gpu_data2, padedtensorRange);
+
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.pad(paddings);
+  sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
+  for (IndexType i = 0; i < padedSizeDim1; ++i) {
+    for (IndexType j = 0; j < padedSizeDim2; ++j) {
+      for (IndexType k = 0; k < padedSizeDim3; ++k) {
+        for (IndexType l = 0; l < padedSizeDim4; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<std::pair<IndexType, IndexType>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Eigen::DSizes<IndexType, 2> reshape_dims;
+  reshape_dims[0] = 12;
+  reshape_dims[1] = 84;
+
+
+  Tensor<DataType, 2, DataLayout, IndexType>  result(reshape_dims);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, reshape_dims);
+
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 6; ++j) {
+      for (IndexType k = 0; k < 12; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          const float result_value = DataLayout == ColMajor ?
+              result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(result_value, 0.0f);
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
+  test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);
+  test_padded_expr<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_patch.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_patch.cpp
index 434359730654a442b9da14c1d601960ccfc8fdd3..498ab8ca7e174f0c50053f3b76f4bbc8cbe18f5c 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_patch.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_patch.cpp
@@ -164,7 +164,7 @@ static void test_simple_patch()
   }
 }
 
-void test_cxx11_tensor_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_patch)
 {
    CALL_SUBTEST(test_simple_patch<ColMajor>());
    CALL_SUBTEST(test_simple_patch<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_patch_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_patch_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f92bec78cb6cf0f65085d88409bf893bfe9a258
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -0,0 +1,249 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 5> patchTensorRange;
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}};
+  }else{
+     patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}};
+  }
+
+  Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 5, DataLayout,IndexType> no_patch(patchTensorRange);
+
+  tensor.setRandom();
+
+  array<ptrdiff_t, 4> patch_dims;
+  patch_dims[0] = 1;
+  patch_dims[1] = 1;
+  patch_dims[2] = 1;
+  patch_dims[3] = 1;
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_no_patch(gpu_data_no_patch, patchTensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size());
+  } else {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size());
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
+  }
+
+  patch_dims[0] = 2;
+  patch_dims[1] = 3;
+  patch_dims[2] = 5;
+  patch_dims[3] = 7;
+
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}};
+  }else{
+     patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> single_patch(patchTensorRange);
+  patchTensorBuffSize =single_patch.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch(gpu_data_single_patch, patchTensorRange);
+
+  gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+  } else {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 7);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+  }
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 2;
+  patch_dims[3] = 1;
+  
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1,2,2,1,2*2*4*7}};
+  }else{
+     patchTensorRange = {{2*2*4*7, 1, 2,2,1}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> twod_patch(patchTensorRange);
+  patchTensorBuffSize =twod_patch.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange);
+
+  gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7);
+  } else {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 4 * l));
+          } else {
+            patch_loc = l + 7 * (k + 4 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 2; ++y) {
+              if (DataLayout == ColMajor) {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc));
+              } else {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 3;
+  patch_dims[3] = 5;
+
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1,2,3,5,2*2*3*3}};
+  }else{
+     patchTensorRange = {{2*2*3*3, 1, 2,3,5}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> threed_patch(patchTensorRange);
+  patchTensorBuffSize =threed_patch.size()*sizeof(DataType);
+  DataType* gpu_data_threed_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange);
+
+  gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3);
+  } else {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 5);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 3 * l));
+          } else {
+            patch_loc = l + 3 * (k + 3 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 3; ++y) {
+              for (int z = 0; z < 5; ++z) {
+                if (DataLayout == ColMajor) {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc));
+                } else {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_no_patch);
+  sycl_device.deallocate(gpu_data_single_patch);
+  sycl_device.deallocate(gpu_data_twod_patch);
+  sycl_device.deallocate(gpu_data_threed_patch);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_random.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_random.cpp
index 0f3dc5787ef0f352caf18dfaa08f671a293258aa..b9d4c558476afcbee00d7ec8bcc8cb46e365889a 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_random.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_random.cpp
@@ -11,9 +11,10 @@
 
 #include <Eigen/CXX11/Tensor>
 
+template<typename Scalar>
 static void test_default()
 {
-  Tensor<float, 1> vec(6);
+  Tensor<Scalar, 1> vec(6);
   vec.setRandom();
 
   // Fixme: we should check that the generated numbers follow a uniform
@@ -23,10 +24,11 @@ static void test_default()
   }
 }
 
+template<typename Scalar>
 static void test_normal()
 {
-  Tensor<float, 1> vec(6);
-  vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>();
+  Tensor<Scalar, 1> vec(6);
+  vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>();
 
   // Fixme: we should check that the generated numbers follow a gaussian
   // distribution instead.
@@ -70,9 +72,15 @@ static void test_custom()
   }
 }
 
-void test_cxx11_tensor_random()
+EIGEN_DECLARE_TEST(cxx11_tensor_random)
 {
-  CALL_SUBTEST(test_default());
-  CALL_SUBTEST(test_normal());
+  CALL_SUBTEST((test_default<float>()));
+  CALL_SUBTEST((test_normal<float>()));
+  CALL_SUBTEST((test_default<double>()));
+  CALL_SUBTEST((test_normal<double>()));
+  CALL_SUBTEST((test_default<Eigen::half>()));
+  CALL_SUBTEST((test_normal<Eigen::half>()));
+  CALL_SUBTEST((test_default<Eigen::bfloat16>()));
+  CALL_SUBTEST((test_normal<Eigen::bfloat16>()));
   CALL_SUBTEST(test_custom());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_random_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_random_gpu.cu
similarity index 65%
rename from contrib/eigen/unsupported/test/cxx11_tensor_random_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_random_gpu.cu
index fa1a46732681aae24e977f1b5d1ba9b7e02ea282..090986ebcd7b8d2ba7406a11dc6c38317ae2b8ef 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_random_gpu.cu
@@ -9,15 +9,16 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
 
-void test_cuda_random_uniform()
+void test_gpu_random_uniform()
 {
   Tensor<float, 2> out(72,97);
   out.setZero();
@@ -25,24 +26,24 @@ void test_cuda_random_uniform()
   std::size_t out_bytes = out.size() * sizeof(float);
 
   float* d_out;
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
 
   gpu_out.device(gpu_device) = gpu_out.random();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
-  // For now we just check thes code doesn't crash.
+  // For now we just check this code doesn't crash.
   // TODO: come up with a valid test of randomness
 }
 
 
-void test_cuda_random_normal()
+void test_gpu_random_normal()
 {
   Tensor<float, 2> out(72,97);
   out.setZero();
@@ -50,9 +51,9 @@ void test_cuda_random_normal()
   std::size_t out_bytes = out.size() * sizeof(float);
 
   float* d_out;
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
@@ -60,8 +61,8 @@ void test_cuda_random_normal()
   Eigen::internal::NormalRandomGenerator<float> gen(true);
   gpu_out.device(gpu_device) = gpu_out.random(gen);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 }
 
 static void test_complex()
@@ -77,9 +78,9 @@ static void test_complex()
 }
 
 
-void test_cxx11_tensor_random_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
 {
-  CALL_SUBTEST(test_cuda_random_uniform());
-  CALL_SUBTEST(test_cuda_random_normal());
+  CALL_SUBTEST(test_gpu_random_uniform());
+  CALL_SUBTEST(test_gpu_random_normal());
   CALL_SUBTEST(test_complex());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_random_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_random_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c83894a33a4e4b0d68b93ff12a81664711b7e3b
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device)
+{
+  Tensor<DataType, 2,DataLayout, IndexType> out(72,97);
+  out.setZero();
+
+  std::size_t out_bytes = out.size() * sizeof(DataType);
+
+  IndexType sizeDim0 = 72;
+  IndexType sizeDim1 = 97;
+
+  array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+  DataType* d_out  = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+
+  gpu_out.device(sycl_device)=gpu_out.random();
+  sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+  for(IndexType i=1; i<sizeDim0; i++)
+    for(IndexType j=1; j<sizeDim1; j++)
+    {
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));    }
+
+  // For now we just check thes code doesn't crash.
+  // TODO: come up with a valid test of randomness
+  sycl_device.deallocate(d_out);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device)
+{
+  Tensor<DataType, 2,DataLayout,IndexType> out(72,97);
+  out.setZero();
+  std::size_t out_bytes = out.size() * sizeof(DataType);
+
+  IndexType sizeDim0 = 72;
+  IndexType sizeDim1 = 97;
+
+  array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+  DataType* d_out  = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+  Eigen::internal::NormalRandomGenerator<DataType> gen(true);
+  gpu_out.device(sycl_device)=gpu_out.random(gen);
+  sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+  for(IndexType i=1; i<sizeDim0; i++)
+    for(IndexType j=1; j<sizeDim1; j++)
+    {
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+
+    }
+
+  // For now we just check thes code doesn't crash.
+  // TODO: come up with a valid test of randomness
+  sycl_device.deallocate(d_out);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_random_uniform<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_random_uniform<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_random_normal<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_random_normal<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_random_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+    CALL_SUBTEST(sycl_random_test_per_device<double>(device));
+#endif
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_reduction.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_reduction.cpp
index 1490ec3da5f598a942724d2a5fa1964f31f814ac..c46c4c91d59ef897ea734faa2d430fbe7fa16b88 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_reduction.cpp
@@ -53,20 +53,22 @@ static void test_trivial_reductions() {
   }
 }
 
-template <int DataLayout>
+template <typename Scalar,int DataLayout>
 static void test_simple_reductions() {
-  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  Tensor<Scalar, 4, DataLayout> tensor(2, 3, 5, 7);
   tensor.setRandom();
+  // Add a little offset so that the product reductions won't be close to zero.
+  tensor += tensor.constant(Scalar(0.5f));
   array<ptrdiff_t, 2> reduction_axis2;
   reduction_axis2[0] = 1;
   reduction_axis2[1] = 3;
 
-  Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis2);
+  Tensor<Scalar, 2, DataLayout> result = tensor.sum(reduction_axis2);
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_EQUAL(result.dimension(1), 5);
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 5; ++j) {
-      float sum = 0.0f;
+      Scalar sum = Scalar(0.0f);
       for (int k = 0; k < 3; ++k) {
         for (int l = 0; l < 7; ++l) {
           sum += tensor(i, k, j, l);
@@ -77,7 +79,7 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 0, DataLayout> sum1 = tensor.sum();
+    Tensor<Scalar, 0, DataLayout> sum1 = tensor.sum();
     VERIFY_IS_EQUAL(sum1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -85,7 +87,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
     VERIFY_IS_EQUAL(sum2.rank(), 0);
 
     VERIFY_IS_APPROX(sum1(), sum2());
@@ -98,7 +100,7 @@ static void test_simple_reductions() {
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 7; ++j) {
-      float prod = 1.0f;
+      Scalar prod = Scalar(1.0f);
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 5; ++l) {
           prod *= tensor(k, i, l, j);
@@ -109,7 +111,7 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 0, DataLayout> prod1 = tensor.prod();
+    Tensor<Scalar, 0, DataLayout> prod1 = tensor.prod();
     VERIFY_IS_EQUAL(prod1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -117,7 +119,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
     VERIFY_IS_EQUAL(prod2.rank(), 0);
 
     VERIFY_IS_APPROX(prod1(), prod2());
@@ -130,7 +132,7 @@ static void test_simple_reductions() {
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 7; ++j) {
-      float max_val = std::numeric_limits<float>::lowest();
+      Scalar max_val = std::numeric_limits<Scalar>::lowest();
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 5; ++l) {
           max_val = (std::max)(max_val, tensor(k, i, l, j));
@@ -141,7 +143,7 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 0, DataLayout> max1 = tensor.maximum();
+    Tensor<Scalar, 0, DataLayout> max1 = tensor.maximum();
     VERIFY_IS_EQUAL(max1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -149,7 +151,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
     VERIFY_IS_EQUAL(max2.rank(), 0);
 
     VERIFY_IS_APPROX(max1(), max2());
@@ -162,7 +164,7 @@ static void test_simple_reductions() {
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   for (int i = 0; i < 5; ++i) {
     for (int j = 0; j < 7; ++j) {
-      float min_val = (std::numeric_limits<float>::max)();
+      Scalar min_val = (std::numeric_limits<Scalar>::max)();
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 3; ++l) {
           min_val = (std::min)(min_val, tensor(k, l, i, j));
@@ -173,7 +175,7 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 0, DataLayout> min1 = tensor.minimum();
+    Tensor<Scalar, 0, DataLayout> min1 = tensor.minimum();
     VERIFY_IS_EQUAL(min1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -181,7 +183,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
     VERIFY_IS_EQUAL(min2.rank(), 0);
 
     VERIFY_IS_APPROX(min1(), min2());
@@ -194,7 +196,7 @@ static void test_simple_reductions() {
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   for (int i = 0; i < 5; ++i) {
     for (int j = 0; j < 7; ++j) {
-      float sum = 0.0f;
+      Scalar sum = Scalar(0.0f);
       int count = 0;
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 3; ++l) {
@@ -202,12 +204,12 @@ static void test_simple_reductions() {
           ++count;
         }
       }
-      VERIFY_IS_APPROX(result(i, j), sum / count);
+      VERIFY_IS_APPROX(result(i, j), sum / Scalar(count));
     }
   }
 
   {
-    Tensor<float, 0, DataLayout> mean1 = tensor.mean();
+    Tensor<Scalar, 0, DataLayout> mean1 = tensor.mean();
     VERIFY_IS_EQUAL(mean1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -215,7 +217,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
     VERIFY_IS_EQUAL(mean2.rank(), 0);
 
     VERIFY_IS_APPROX(mean1(), mean2());
@@ -225,11 +227,11 @@ static void test_simple_reductions() {
     Tensor<int, 1> ints(10);
     std::iota(ints.data(), ints.data() + ints.dimension(0), 0);
 
-    TensorFixedSize<bool, Sizes<> > all;
-    all = ints.all();
-    VERIFY(!all());
-    all = (ints >= ints.constant(0)).all();
-    VERIFY(all());
+    TensorFixedSize<bool, Sizes<> > all_;
+    all_ = ints.all();
+    VERIFY(!all_());
+    all_ = (ints >= ints.constant(0)).all();
+    VERIFY(all_());
 
     TensorFixedSize<bool, Sizes<> > any;
     any = (ints > ints.constant(10)).any();
@@ -368,7 +370,7 @@ static void test_static_dims() {
   Tensor<float, 2, DataLayout> out(72, 97);
   in.setRandom();
 
-#if !EIGEN_HAS_CONSTEXPR 
+#if !EIGEN_HAS_CONSTEXPR
   array<int, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 3;
@@ -386,7 +388,7 @@ static void test_static_dims() {
           expected = (std::max)(expected, in(i, k, j, l));
         }
       }
-      VERIFY_IS_APPROX(out(i, j), expected);
+      VERIFY_IS_EQUAL(out(i, j), expected);
     }
   }
 }
@@ -417,7 +419,7 @@ static void test_innermost_last_dims() {
           expected = (std::max)(expected, in(l, k, i, j));
         }
       }
-      VERIFY_IS_APPROX(out(i, j), expected);
+      VERIFY_IS_EQUAL(out(i, j), expected);
     }
   }
 }
@@ -448,7 +450,7 @@ static void test_innermost_first_dims() {
           expected = (std::max)(expected, in(i, j, k, l));
         }
       }
-      VERIFY_IS_APPROX(out(i, j), expected);
+      VERIFY_IS_EQUAL(out(i, j), expected);
     }
   }
 }
@@ -479,16 +481,37 @@ static void test_reduce_middle_dims() {
           expected = (std::max)(expected, in(i, k, l, j));
         }
       }
-      VERIFY_IS_APPROX(out(i, j), expected);
+      VERIFY_IS_EQUAL(out(i, j), expected);
+    }
+  }
+}
+
+static void test_sum_accuracy() {
+  Tensor<float, 3> tensor(101, 101, 101);
+  for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) {
+    tensor.setRandom();
+    tensor += tensor.constant(prescribed_mean);
+
+    Tensor<float, 0> sum = tensor.sum();
+    double expected_sum = 0.0;
+    for (int i = 0; i < 101; ++i) {
+      for (int j = 0; j < 101; ++j) {
+        for (int k = 0; k < 101; ++k) {
+          expected_sum += static_cast<double>(tensor(i, j, k));
+        }
+      }
     }
+    VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum));
   }
 }
 
-void test_cxx11_tensor_reduction() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
   CALL_SUBTEST(test_trivial_reductions<ColMajor>());
   CALL_SUBTEST(test_trivial_reductions<RowMajor>());
-  CALL_SUBTEST(test_simple_reductions<ColMajor>());
-  CALL_SUBTEST(test_simple_reductions<RowMajor>());
+  CALL_SUBTEST(( test_simple_reductions<float,ColMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<float,RowMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<Eigen::half,ColMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<Eigen::bfloat16,ColMajor>() ));
   CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
   CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
   CALL_SUBTEST(test_full_reductions<ColMajor>());
@@ -505,4 +528,5 @@ void test_cxx11_tensor_reduction() {
   CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
+  CALL_SUBTEST(test_sum_accuracy());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_reduction_gpu.cu
similarity index 96%
rename from contrib/eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_reduction_gpu.cu
index ec0669704e0915832d85ddde83fd340fb4256d37..122ac946b259359111bbf547221471dd9fcbc494 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_reduction_gpu.cu
@@ -9,7 +9,7 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
@@ -19,7 +19,7 @@
 template<typename Type, int DataLayout>
 static void test_full_reductions() {
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   const int num_rows = internal::random<int>(1024, 5*1024);
@@ -67,7 +67,7 @@ static void test_first_dim_reductions() {
   Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
 
   // Create device
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice dev(&stream);
   
   // Create data(T)
@@ -107,7 +107,7 @@ static void test_last_dim_reductions() {
   Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
 
   // Create device
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice dev(&stream);
   
   // Create data
@@ -134,7 +134,7 @@ static void test_last_dim_reductions() {
 }
 
 
-void test_cxx11_tensor_reduction_cuda() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) {
   CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
   CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
   CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index a9ef829071c836a6904e9d9ff4015c8a425f2f91..a297716e4d0af6f72c8deaa490a09b595ffe1358 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -13,38 +13,168 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
 
 #include "main.h"
+
 #include <unsupported/Eigen/CXX11/Tensor>
 
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 753;
+  const IndexType num_cols = 537;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
 
+  array<IndexType, 2> outRange = {{1, 1}};
 
-static void test_full_reductions_sycl(const Eigen::SyclDevice&  sycl_device) {
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
+  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
 
-  const int num_rows = 452;
-  const int num_cols = 765;
-  array<int, 2> tensorRange = {{num_rows, num_cols}};
+  in.setRandom();
+  auto dim = DSizes<IndexType, 2>(1, 1);
+  full_redux = in.sum().reshape(dim);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(
+      sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
+                                                                outRange);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
+  sycl_device.memcpyDeviceToHost(
+      full_redux_gpu.data(), gpu_out_data,
+      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
+            << ", CPU FULL: " << full_redux(0, 0) << "\n";
+  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
 
-  Tensor<float, 2> in(tensorRange);
-  Tensor<float, 0> full_redux;
-  Tensor<float, 0> full_redux_gpu;
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
 
   in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.sum();
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
 
-  full_redux = in.sum();
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
 
-  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
-  float* gpu_out_data =(float*)sycl_device.allocate(sizeof(float));
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
 
-  TensorMap<Tensor<float, 2> >  in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<float, 0> >  out_gpu(gpu_out_data);
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 4096;
+  const IndexType num_cols = 4096;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+
+  full_redux = in.maximum();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set the initial value to be the max.
+  // As we don't include this in the reduction the result should not be 2.
+  in(0) = static_cast<DataType>(2);
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.maximum();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
-  out_gpu.device(sycl_device) = in_gpu.sum();
-  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float));
   // Check that the CPU and GPU reductions return the same result.
   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
 
@@ -52,87 +182,833 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice&  sycl_device) {
   sycl_device.deallocate(gpu_out_data);
 }
 
-static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 4096;
+  const IndexType num_cols = 4096;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+  array<IndexType, 1> argRange = {{num_cols}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  //  red_axis[1]=1;
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+  in_arg1.setRandom();
+  in_arg2.setRandom();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
+      in_arg1.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
+      in_arg2.dimensions().TotalSize() * sizeof(DataType)));
+  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
+      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
+  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
+      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
+      gpu_in_arg1_data, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
+      gpu_in_arg2_data, tensorRange);
+  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
+      gpu_out_arg_data, argRange);
+  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
+      gpu_out_arg__gpu_helper_data, argRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+  // CPU VERSION
+  out_arg_cpu =
+      (in_arg1.argmax(1) == in_arg2.argmax(1))
+          .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
+  full_redux = (out_arg_cpu.template cast<float>())
+                   .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+
+  // GPU VERSION
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_arg1_data, in_arg1.data(),
+      (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_arg2_data, in_arg2.data(),
+      (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
+  out_Argout_gpu_helper.device(sycl_device) =
+      (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
+  out_Argout_gpu.device(sycl_device) =
+      (out_Argout_gpu_helper)
+          .select(out_Argout_gpu.constant(true),
+                  out_Argout_gpu.constant(false));
+  out_gpu.device(sycl_device) =
+      (out_Argout_gpu.template cast<float>())
+          .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
+            << '\n';
+  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_in_arg1_data);
+  sycl_device.deallocate(gpu_in_arg2_data);
+  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
+  sycl_device.deallocate(gpu_out_arg_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.mean();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_odd_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  // This is a particular case which illustrates a possible problem when the
+  // number of local threads in a workgroup is even, but is not a power of two.
+  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
+  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
+  const IndexType n_elems = 8707;
+  array<IndexType, 1> tensor_range = {{n_elems}};
+
+  data_tensor in(tensor_range);
+  DataType full_redux;
+  DataType full_redux_gpu;
+  TensorMap<scalar_tensor> red_cpu(&full_redux);
+  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
+
+  const DataType const_val = static_cast<DataType>(0.6391);
+  in = in.constant(const_val);
+
+  Eigen::IndexList<Eigen::type2index<0>> red_axis;
+  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  VERIFY_IS_APPROX(const_val, red_cpu());
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) =
+      in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu, full_redux);
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 876;
+  const IndexType num_cols = 953;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+
+  full_redux = in.minimum();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.minimum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set the initial value to be the min.
+  // As we don't include this in the reduction the result should not be -2.
+  in(0) = static_cast<DataType>(-2);
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.minimum();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.minimum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 145;
+  IndexType dim_y = 1;
+  IndexType dim_z = 67;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.maximum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  array<IndexType, 1> reduced_range = {{num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+  const IndexType n_reduced = num_cols;
 
-  int dim_x = 145;
-  int dim_y = 1;
-  int dim_z = 67;
+  data_tensor in(tensor_range);
+  reduced_tensor redux;
+  reduced_tensor redux_gpu(reduced_range);
 
-  array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
-  Eigen::array<int, 1> red_axis;
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set maximum value outside of the considered range.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    in(i) = static_cast<DataType>(2);
+  }
+
+  Eigen::array<IndexType, 1> red_axis;
   red_axis[0] = 0;
-  array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};
 
-  Tensor<float, 3> in(tensorRange);
-  Tensor<float, 2> redux(reduced_tensorRange);
-  Tensor<float, 2> redux_gpu(reduced_tensorRange);
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  redux = in_offset.maximum(red_axis);
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_NOT_EQUAL(redux(i), in(i));
+  }
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(
+      sycl_device.allocate(n_reduced * sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
+                                 n_reduced * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_APPROX(redux_gpu(i), redux(i));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  array<IndexType, 1> full_reduced_range = {{num_rows}};
+  array<IndexType, 1> reduced_range = {{num_rows - 1}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+  const IndexType n_reduced = reduced_range[0];
+
+  data_tensor in(tensor_range);
+  reduced_tensor redux(full_reduced_range);
+  reduced_tensor redux_gpu(reduced_range);
 
   in.setRandom();
+  redux.setZero();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set maximum value outside of the considered range.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    in(i) = static_cast<DataType>(2);
+  }
+
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 1;
+
+  const IndexType offset = 64;
+  // Introduce an offset in both the input and the output.
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
+  red_offset = in_offset.maximum(red_axis);
+
+  // Check that the first value hasn't been changed and that the reduced values
+  // are not equal to the previously set maximum in the input outside the range.
+  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
+  }
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(
+      sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
+                                 n_reduced * sizeof(DataType));
 
-  redux= in.sum(red_axis);
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
 
-  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
-  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
+  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 1> reduced_tensorRange = {{dim_y}};
 
-  TensorMap<Tensor<float, 3> >  in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<float, 2> >  out_gpu(gpu_out_data, reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  in.setRandom();
+  redux = in.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
-  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
 
   // Check that the CPU and GPU reductions return the same result.
-  for(int j=0; j<reduced_tensorRange[0]; j++ )
-    for(int k=0; k<reduced_tensorRange[1]; k++ )
-      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+  for (IndexType i = 0; i < redux.size(); i++) {
+    VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
+  }
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 145;
+  IndexType dim_y = 1;
+  IndexType dim_z = 67;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
 
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
 
-static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 64;
+  IndexType dim_y = 1;
+  IndexType dim_z = 32;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 2;
+  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
 
-  int dim_x = 567;
-  int dim_y = 1;
-  int dim_z = 47;
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
 
-  array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
-  Eigen::array<int, 1> red_axis;
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 64;
+  IndexType dim_y = 1;
+  IndexType dim_z = 32;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
   red_axis[0] = 2;
-  array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};
+  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
 
-  Tensor<float, 3> in(tensorRange);
-  Tensor<float, 2> redux(reduced_tensorRange);
-  Tensor<float, 2> redux_gpu(reduced_tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
 
   in.setRandom();
 
-  redux= in.sum(red_axis);
+  redux = in.sum(red_axis);
 
-  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
-  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
 
-  TensorMap<Tensor<float, 3> >  in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<float, 2> >  out_gpu(gpu_out_data, reduced_tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
-  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
   // Check that the CPU and GPU reductions return the same result.
-  for(int j=0; j<reduced_tensorRange[0]; j++ )
-    for(int k=0; k<reduced_tensorRange[1]; k++ )
-      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
 
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  // auto red_axis =  Sizes<0,1>(0,1);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+
+  redux_fix = in_fix.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
 
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+  redux_fix = in_fix.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  sycl_device.synchronize();
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+// SYCL supports a generic case of reduction where the accumulator is a
+// different type than the input data This is an example on how to get if a
+// Tensor contains nan and/or inf in one reduction
+template <typename InT, typename OutT>
+struct CustomReducer {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  static constexpr OutT InfBit = 1;
+  static constexpr OutT NanBit = 2;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
+                                                    OutT* accum) const {
+    if (Eigen::numext::isinf(x))
+      *accum |= InfBit;
+    else if (Eigen::numext::isnan(x))
+      *accum |= NanBit;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
+                                                    OutT* accum) const {
+    *accum |= x;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
+    return OutT(0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
+    return accum;
+  }
+};
+
+template <typename DataType, typename AccumType, int DataLayout,
+          typename IndexType>
+static void test_full_reductions_custom_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  constexpr IndexType InSize = 64;
+  auto tensorRange = Sizes<InSize>(InSize);
+  Eigen::IndexList<Eigen::type2index<0>> dims;
+  auto reduced_tensorRange = Sizes<>();
+  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
+  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
+
+  CustomReducer<DataType, AccumType> reducer;
+
+  in_fix.setRandom();
+
+  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
+  AccumType* gpu_out_data =
+      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
+  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
+                                 sizeof(AccumType));
+  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
 }
 
-void test_cxx11_tensor_reduction_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST((test_full_reductions_sycl(sycl_device)));
-  CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device)));
-  CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device)));
+template <typename DataType, typename Dev>
+void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
+  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  129, 8);
+  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
+  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  sycl_device.synchronize();
+}
 
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
+  }
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_ref.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_ref.cpp
index c8f105e3d877b9ab1036fc7ce52f29636b4f7bee..7dbd0478caeee58ddac3be33f307e3d06d66e88d 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_ref.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_ref.cpp
@@ -235,7 +235,7 @@ static void test_nested_ops_with_ref()
 }
 
 
-void test_cxx11_tensor_ref()
+EIGEN_DECLARE_TEST(cxx11_tensor_ref)
 {
   CALL_SUBTEST(test_simple_lvalue_ref());
   CALL_SUBTEST(test_simple_rvalue_ref());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_reverse.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_reverse.cpp
index b35b8d29ede527a555e120f9370c11fc6464d5ea..5e44ec007da547dc24839ec489300f93dcdf172c 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_reverse.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_reverse.cpp
@@ -179,7 +179,7 @@ static void test_expr_reverse(bool LValue)
 }
 
 
-void test_cxx11_tensor_reverse()
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse)
 {
   CALL_SUBTEST(test_simple_reverse<ColMajor>());
   CALL_SUBTEST(test_simple_reverse<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd30c235d81ca1a0bec7157d57a825ad76c5eb90
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
+  IndexType dim1 = 2;
+  IndexType dim2 = 3;
+  IndexType dim3 = 5;
+  IndexType dim4 = 7;
+
+  array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = true;
+  dim_rev[3] = false;
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data,
+                                                                 tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(i, 2 - j, 4 - k, l));
+        }
+      }
+    }
+  }
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = false;
+
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l));
+        }
+      }
+    }
+  }
+
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(1 - i, j, k, 6 - l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr_reverse(const Eigen::SyclDevice& sycl_device,
+                              bool LValue) {
+  IndexType dim1 = 2;
+  IndexType dim2 = 3;
+  IndexType dim3 = 5;
+  IndexType dim4 = 7;
+
+  array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate(
+      expected.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_result = static_cast<DataType*>(
+      sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(
+      gpu_out_data_expected, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(
+      gpu_out_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
+
+  if (LValue) {
+    out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
+  } else {
+    out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
+  }
+  sycl_device.memcpyDeviceToHost(
+      expected.data(), gpu_out_data_expected,
+      expected.dimensions().TotalSize() * sizeof(DataType));
+
+  array<IndexType, 4> src_slice_dim;
+  src_slice_dim[0] = 2;
+  src_slice_dim[1] = 3;
+  src_slice_dim[2] = 1;
+  src_slice_dim[3] = 7;
+  array<IndexType, 4> src_slice_start;
+  src_slice_start[0] = 0;
+  src_slice_start[1] = 0;
+  src_slice_start[2] = 0;
+  src_slice_start[3] = 0;
+  array<IndexType, 4> dst_slice_dim = src_slice_dim;
+  array<IndexType, 4> dst_slice_start = src_slice_start;
+
+  for (IndexType i = 0; i < 5; ++i) {
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
+    src_slice_start[2] += 1;
+    dst_slice_start[2] += 1;
+  }
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < expected.dimension(0); ++i) {
+    for (IndexType j = 0; j < expected.dimension(1); ++j) {
+      for (IndexType k = 0; k < expected.dimension(2); ++k) {
+        for (IndexType l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
+        }
+      }
+    }
+  }
+
+  dst_slice_start[2] = 0;
+  result.setRandom();
+  sycl_device.memcpyHostToDevice(
+      gpu_out_data_result, result.data(),
+      (result.dimensions().TotalSize()) * sizeof(DataType));
+  for (IndexType i = 0; i < 5; ++i) {
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    }
+    dst_slice_start[2] += 1;
+  }
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < expected.dimension(0); ++i) {
+    for (IndexType j = 0; j < expected.dimension(1); ++j) {
+      for (IndexType k = 0; k < expected.dimension(2); ++k) {
+        for (IndexType l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
+        }
+      }
+    }
+  }
+}
+
+template <typename DataType>
+void sycl_reverse_test_per_device(const cl::sycl::device& d) {
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device);
+  test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false);
+  test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false);
+  test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
+  test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.get_info<cl::sycl::info::device::name>() << std::endl;
+    CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device));
+    CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device));
+    CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+    CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
+#endif
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_roundings.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_roundings.cpp
index 2c26151ab657bca071b991403845e1d3aa07fb9a..83b592384b02fecd6b7f918b57ed41b800dda94c 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_roundings.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_roundings.cpp
@@ -54,7 +54,7 @@ static void test_float_ceiling()
   }
 }
 
-void test_cxx11_tensor_roundings()
+EIGEN_DECLARE_TEST(cxx11_tensor_roundings)
 {
    CALL_SUBTEST(test_float_rounding());
    CALL_SUBTEST(test_float_ceiling());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_scan.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_scan.cpp
index af59aa3ef228ab5a4fac0ff171caf6a4365018b0..dccee9e848ccbdf9d1cd70b1226b445127d7c754 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_scan.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_scan.cpp
@@ -98,7 +98,7 @@ static void test_tensor_maps() {
   }
 }
 
-void test_cxx11_tensor_scan() {
+EIGEN_DECLARE_TEST(cxx11_tensor_scan) {
   CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));
   CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
   CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_scan_cuda.cu b/contrib/eigen/unsupported/test/cxx11_tensor_scan_gpu.cu
similarity index 75%
rename from contrib/eigen/unsupported/test/cxx11_tensor_scan_cuda.cu
rename to contrib/eigen/unsupported/test/cxx11_tensor_scan_gpu.cu
index de1c0ac9595caf020dbd33fbf7784e639357a8a0..770a144f12278b6cb0c12c13bceb520e523cc1eb 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_scan_gpu.cu
@@ -9,18 +9,20 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;
 
 template<int DataLayout>
-void test_cuda_cumsum(int m_size, int k_size, int n_size)
+void test_gpu_cumsum(int m_size, int k_size, int n_size)
 {
   std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
   Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
@@ -35,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
   float* d_t_input;
   float* d_t_result;
 
-  cudaMalloc((void**)(&d_t_input), t_input_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMalloc((void**)(&d_t_input), t_input_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
 
-  cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
@@ -51,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
   gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
   t_result = t_input.cumsum(1);
 
-  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
   for (DenseIndex i = 0; i < t_result.size(); i++) {
     if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
       continue;
@@ -64,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
     assert(false);
   }
 
-  cudaFree((void*)d_t_input);
-  cudaFree((void*)d_t_result);
+  gpuFree((void*)d_t_input);
+  gpuFree((void*)d_t_result);
 }
 
 
-void test_cxx11_tensor_scan_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu)
 {
-  CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
-  CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_scan_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_scan_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..09c45fce5ede65a3bb3ccd14a2343a40b6d80945
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_scan_sycl.cpp
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size,
+                      IndexType k_size, IndexType n_size, int consume_dim,
+                      bool exclusive) {
+  static const DataType error_threshold = 1e-4f;
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size
+            << " consume_dim : " << consume_dim << ")" << std::endl;
+  Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size);
+  Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size);
+  Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size,
+                                                          n_size);
+
+  t_input.setRandom();
+  std::size_t t_input_bytes = t_input.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType* gpu_data_in =
+      static_cast<DataType*>(sycl_device.allocate(t_input_bytes));
+  DataType* gpu_data_out =
+      static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
+
+  array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}};
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input(
+      gpu_data_in, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result(
+      gpu_data_out, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes);
+  sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive);
+
+  t_result = t_input.cumsum(consume_dim, exclusive);
+
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out,
+                                 t_result_bytes);
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i)
+              << " vs SYCL : " << t_result_gpu(i) << std::endl;
+    assert(false);
+  }
+  sycl_device.deallocate(gpu_data_in);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                false);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(
+        sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(
+        sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(
+        sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device));
+    CALL_SUBTEST_5(
+        sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device));
+    CALL_SUBTEST_6(
+        sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_shuffling.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_shuffling.cpp
index d11444a14ae320e0ab7da2a249e236722c4b99c0..89a64c02172d1d259e88a163c3bba6aadd21c4a9 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_shuffling.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -81,12 +81,12 @@ static void test_expr_shuffling()
   Tensor<float, 4, DataLayout> expected;
   expected = tensor.shuffle(shuffles);
 
-  Tensor<float, 4, DataLayout> result(5,7,3,2);
+  Tensor<float, 4, DataLayout> result(5, 7, 3, 2);
 
-  array<int, 4> src_slice_dim{{2,3,1,7}};
-  array<int, 4> src_slice_start{{0,0,0,0}};
-  array<int, 4> dst_slice_dim{{1,7,3,2}};
-  array<int, 4> dst_slice_start{{0,0,0,0}};
+  array<ptrdiff_t, 4> src_slice_dim{{2, 3, 1, 7}};
+  array<ptrdiff_t, 4> src_slice_start{{0, 0, 0, 0}};
+  array<ptrdiff_t, 4> dst_slice_dim{{1, 7, 3, 2}};
+  array<ptrdiff_t, 4> dst_slice_start{{0, 0, 0, 0}};
 
   for (int i = 0; i < 5; ++i) {
     result.slice(dst_slice_start, dst_slice_dim) =
@@ -215,7 +215,60 @@ static void test_shuffle_unshuffle()
 }
 
 
-void test_cxx11_tensor_shuffling()
+template <int DataLayout>
+static void test_empty_shuffling()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,0,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+
+  Tensor<float, 4, DataLayout> no_shuffle;
+  no_shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 0; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 0);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 0; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling)
 {
   CALL_SUBTEST(test_simple_shuffling<ColMajor>());
   CALL_SUBTEST(test_simple_shuffling<RowMajor>());
@@ -225,4 +278,6 @@ void test_cxx11_tensor_shuffling()
   CALL_SUBTEST(test_shuffling_as_value<RowMajor>());
   CALL_SUBTEST(test_shuffle_unshuffle<ColMajor>());
   CALL_SUBTEST(test_shuffle_unshuffle<RowMajor>());
+  CALL_SUBTEST(test_empty_shuffling<ColMajor>());
+  CALL_SUBTEST(test_empty_shuffling<RowMajor>());
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ca4e8b5ef33fa3ecad950edaf5da6b70fc3c9a39
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) {
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> no_shuffle(tensorRange);
+  tensor.setRandom();
+
+  const size_t buffSize = tensor.size() * sizeof(DataType);
+  array<IndexType, 4> shuffles;
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1,
+                                                             tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2,
+                                                             tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
+
+  gpu2.device(sycl_device) = gpu1.shuffle(shuffles);
+  sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize);
+  sycl_device.synchronize();
+
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l));
+        }
+      }
+    }
+  }
+
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  array<IndexType, 4> tensorrangeShuffle = {
+      {sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
+  Tensor<DataType, 4, DataLayout, IndexType> shuffle(tensorrangeShuffle);
+  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu3(
+      gpu_data3, tensorrangeShuffle);
+
+  gpu3.device(sycl_device) = gpu1.shuffle(shuffles);
+  sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
+  sycl_device.synchronize();
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), sizeDim3);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), sizeDim4);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i));
+        }
+      }
+    }
+  }
+}
+
+template <typename DataType, typename dev_Selector>
+void sycl_shuffling_test_per_device(dev_Selector s) {
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_simple.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_simple.cpp
index 5a0d339ef9a65a7bfaa170f877901acabfe96855..6d70f5435fc632295d68db17d60b53ff475a321c 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_simple.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_simple.cpp
@@ -316,7 +316,7 @@ static void test_resize()
   VERIFY_IS_EQUAL(epsilon.size(), 3*5*7);
 }
 
-void test_cxx11_tensor_simple()
+EIGEN_DECLARE_TEST(cxx11_tensor_simple)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_striding.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_striding.cpp
index 935b908cca7154b88170ac4584c3f3ecb45ac6d0..aefdfa9b4412439c497e60c20355426aa39b8361 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_striding.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_striding.cpp
@@ -110,7 +110,7 @@ static void test_striding_as_lvalue()
 }
 
 
-void test_cxx11_tensor_striding()
+EIGEN_DECLARE_TEST(cxx11_tensor_striding)
 {
   CALL_SUBTEST(test_simple_striding<ColMajor>());
   CALL_SUBTEST(test_simple_striding<RowMajor>());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d3b1fa77cdf70b23e8c8108569af408526920de6
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp
@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
+{
+
+  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+  Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}};
+
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
+
+
+  std::size_t tensor_bytes = tensor.size()  * sizeof(DataType);
+  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+  std::size_t stride_bytes = stride.size() * sizeof(DataType);
+  DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+  DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+  DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+
+  tensor.setRandom();
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+  gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  //no_stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+//Tensor<float, 4, DataLayout> stride;
+//  stride = tensor.stride(strides);
+
+  gpu_stride.device(sycl_device)=gpu_tensor.stride(strides);
+  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+  VERIFY_IS_EQUAL(stride.dimension(0), 1);
+  VERIFY_IS_EQUAL(stride.dimension(1), 1);
+  VERIFY_IS_EQUAL(stride.dimension(2), 3);
+  VERIFY_IS_EQUAL(stride.dimension(3), 3);
+
+  for (IndexType i = 0; i < 1; ++i) {
+    for (IndexType j = 0; j < 1; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        for (IndexType l = 0; l < 3; ++l) {
+          VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(d_tensor);
+  sycl_device.deallocate(d_no_stride);
+  sycl_device.deallocate(d_stride);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+  Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}};
+
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
+
+
+  std::size_t tensor_bytes = tensor.size()  * sizeof(DataType);
+  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+  std::size_t stride_bytes = stride.size() * sizeof(DataType);
+
+  DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+  DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+  DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+  //Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<IndexType, 4> strides;
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+//  Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
+//  result.stride(strides) = tensor;
+  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+  gpu_stride.stride(strides).device(sycl_device)=gpu_tensor;
+  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> no_strides;
+  no_strides[0] = 1;
+  no_strides[1] = 1;
+  no_strides[2] = 1;
+  no_strides[3] = 1;
+//  Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
+//  result2.stride(strides) = tensor.stride(no_strides);
+
+  gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(d_tensor);
+  sycl_device.deallocate(d_no_stride);
+  sycl_device.deallocate(d_stride);
+}
+
+
+template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
+  QueueInterface queueInterface(s);
+  auto sycl_device=Eigen::SyclDevice(&queueInterface);
+  test_simple_striding<float, ColMajor, int64_t>(sycl_device);
+  test_simple_striding<float, RowMajor, int64_t>(sycl_device);
+  test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device);
+  test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_striding_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorStridingPerDevice(device));
+  }
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_sugar.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_sugar.cpp
index 2f56eb495fd0b4e1e2bec413288257da878b99d8..2ca5c47db62d2bbb7e00c700a8179076c5f607a4 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_sugar.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_sugar.cpp
@@ -73,7 +73,7 @@ static void test_scalar_sugar_sub_div() {
   }
 }
 
-void test_cxx11_tensor_sugar()
+EIGEN_DECLARE_TEST(cxx11_tensor_sugar)
 {
   CALL_SUBTEST(test_comparison_sugar());
   CALL_SUBTEST(test_scalar_sugar_add_mul());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_sycl.cpp
index 6a9c334223e4538bf84e1532730fcf196932ca97..e6c5e23782b3bbcf3bbcd6560fd2a90dfaf7b61e 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_sycl.cpp
@@ -15,8 +15,8 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
 #include "main.h"
@@ -27,36 +27,188 @@ using Eigen::SyclDevice;
 using Eigen::Tensor;
 using Eigen::TensorMap;
 
-void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 5;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);
+
+  in1 = in1.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType));
+  gpu1.device(sycl_device) = gpu1 * 3.14f;
+  gpu2.device(sycl_device) = gpu2 * 2.7f;
+  sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < in1.size(); ++i) {
+  //  std::cout << "SYCL DATA : " << out1(i) << "  vs  CPU DATA : " << in1(i) * 3.14f << "\n";
+    VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
+    VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
+    VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
+  }
+
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
+  IndexType size = 20;
+  array<IndexType, 1> tensorRange = {{size}};
+  Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);
+
+  in1 = in1.random();
+  in2 = in1;
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  in1.setZero();
+
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < in1.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), in2(i));
+  }
+
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) {
+  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+  IndexType full_size = 32;
+  IndexType half_size = full_size / 2;
+  array<IndexType, 1> tensorRange = {{full_size}};
+  tensor_type in1(tensorRange);
+  tensor_type out(tensorRange);
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+  in1 = in1.random();
+  // Copy all data to device, then permute on copy back to host
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  in1 = in1.random();
+  out.setZero();
+  // Permute copies to device, then copy all back to host
+  sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  in1 = in1.random();
+  out.setZero();
+  DataType* gpu_data_out  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange);
+  // Copy all to device, permute copies on device, then copy all back to host
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+  sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType));
+  sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  sycl_device.deallocate(gpu_data_out);
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) {
+  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+  IndexType full_size = 32;
+  IndexType half_size = full_size / 2;
+  array<IndexType, 1> tensorRange = {{full_size}};
+  tensor_type cpu_out(tensorRange);
+  tensor_type out(tensorRange);
+
+  cpu_out.setZero();
+
+  std::memset(cpu_out.data(), 0, half_size * sizeof(DataType));
+  std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType));
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+  sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType));
+  sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < full_size; ++i) {
+    VERIFY_IS_APPROX(out(i), cpu_out(i));
+  }
+
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
 
-  int sizeDim1 = 100;
-  int sizeDim2 = 100;
-  int sizeDim3 = 100;
-  array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
-  Tensor<float, 3> in1(tensorRange);
-  Tensor<float, 3> in2(tensorRange);
-  Tensor<float, 3> in3(tensorRange);
-  Tensor<float, 3> out(tensorRange);
+  IndexType sizeDim1 = 100;
+  IndexType sizeDim2 = 10;
+  IndexType sizeDim3 = 20;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
 
   in2 = in2.random();
   in3 = in3.random();
 
-  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_in3_data  = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_out_data =  static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
+  DataType * gpu_in3_data  = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
 
-  TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
-  TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
-  TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange);
-  TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
 
   /// a=1.2f
   gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
-  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
       }
     }
@@ -65,10 +217,12 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
 
   /// a=b*1.2f
   gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) * 1.2f);
       }
@@ -77,12 +231,14 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
   printf("a=b*1.2f Test Passed\n");
 
   /// c=a*b
-  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
   gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) *
                              in2(i,j,k));
@@ -93,10 +249,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
 
   /// c=a+b
   gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) +
                              in2(i,j,k));
@@ -107,10 +264,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
 
   /// c=a*a
   gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) *
                              in1(i,j,k));
@@ -121,10 +279,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
 
   //a*3.14f + b*2.7f
   gpu_out.device(sycl_device) =  gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
-  sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) * 3.14f
                        + in2(i,j,k) * 2.7f);
@@ -134,12 +293,13 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
   printf("a*3.14f + b*2.7f Test Passed\n");
 
   ///d= (a>0.5? b:c)
-  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType));
   gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
                                                 ? in2(i, j, k)
                                                 : in3(i, j, k));
@@ -152,8 +312,50 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
   sycl_device.deallocate(gpu_in3_data);
   sycl_device.deallocate(gpu_out_data);
 }
-void test_cxx11_tensor_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_sycl_cpu(sycl_device));
+template<typename Scalar1, typename Scalar2,  int DataLayout, typename IndexType>
+static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
+    IndexType size = 20;
+    array<IndexType, 1> tensorRange = {{size}};
+    Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
+    Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
+    Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);
+
+    in = in.random();
+
+    Scalar1* gpu_in_data  = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
+    Scalar2 * gpu_out_data =  static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));
+
+    TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
+    TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+    sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));
+    gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>();
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
+    out_host = in. template cast<Scalar2>();
+    for(IndexType i=0; i< size; i++)
+    {
+      VERIFY_IS_APPROX(out(i), out_host(i));
+    }
+    printf("cast Test Passed\n");
+    sycl_device.deallocate(gpu_in_data);
+    sycl_device.deallocate(gpu_out_data);
+}
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
+  test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+  }
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_symmetry.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_symmetry.cpp
index d680e9b3b9eaa19ff7724f09a886de32d96fd129..fed269a9a29b7866db5d8219b34a749288867a72 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_symmetry.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_symmetry.cpp
@@ -801,7 +801,7 @@ static void test_tensor_randacc()
   }
 }
 
-void test_cxx11_tensor_symmetry()
+EIGEN_DECLARE_TEST(cxx11_tensor_symmetry)
 {
   CALL_SUBTEST(test_symgroups_static());
   CALL_SUBTEST(test_symgroups_dynamic());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_thread_local.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_thread_local.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7e866f6d1c8b818ecc6a6759667f46c324231e6a
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_thread_local.cpp
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <iostream>
+#include <unordered_set>
+
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+struct Counter {
+  Counter() = default;
+
+  void inc() {
+    // Check that mutation happens only in a thread that created this counter.
+    VERIFY_IS_EQUAL(std::this_thread::get_id(), created_by);
+    counter_value++;
+  }
+  int value() { return counter_value; }
+
+  std::thread::id created_by;
+  int counter_value = 0;
+};
+
+struct InitCounter {
+  void operator()(Counter& counter) {
+    counter.created_by = std::this_thread::get_id();
+  }
+};
+
+void test_simple_thread_local() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter());
+
+  int num_tasks = 3 * num_threads;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  counter.ForEach(
+      [](std::thread::id, Counter& cnt) { VERIFY_IS_EQUAL(cnt.value(), 3); });
+}
+
+void test_zero_sized_thread_local() {
+  Eigen::ThreadLocal<Counter, InitCounter> counter(0, InitCounter());
+
+  Counter& local = counter.local();
+  local.inc();
+
+  int total = 0;
+  counter.ForEach([&total](std::thread::id, Counter& cnt) {
+    total += cnt.value();
+    VERIFY_IS_EQUAL(cnt.value(), 1);
+  });
+
+  VERIFY_IS_EQUAL(total, 1);
+}
+
+// All thread local values fits into the lock-free storage.
+void test_large_number_of_tasks_no_spill() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter());
+
+  int num_tasks = 10000;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  int total = 0;
+  std::unordered_set<std::thread::id> unique_threads;
+
+  counter.ForEach([&](std::thread::id id, Counter& cnt) {
+    total += cnt.value();
+    unique_threads.insert(id);
+  });
+
+  VERIFY_IS_EQUAL(total, num_tasks);
+  // Not all threads in a pool might be woken up to execute submitted tasks.
+  // Also thread_pool.Schedule() might use current thread if queue is full.
+  VERIFY_IS_EQUAL(
+      unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true);
+}
+
+// Lock free thread local storage is too small to fit all the unique threads,
+// and it spills to a map guarded by a mutex.
+void test_large_number_of_tasks_with_spill() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(1, InitCounter());
+
+  int num_tasks = 10000;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  int total = 0;
+  std::unordered_set<std::thread::id> unique_threads;
+
+  counter.ForEach([&](std::thread::id id, Counter& cnt) {
+    total += cnt.value();
+    unique_threads.insert(id);
+  });
+
+  VERIFY_IS_EQUAL(total, num_tasks);
+  // Not all threads in a pool might be woken up to execute submitted tasks.
+  // Also thread_pool.Schedule() might use current thread if queue is full.
+  VERIFY_IS_EQUAL(
+      unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_local) {
+  CALL_SUBTEST(test_simple_thread_local());
+  CALL_SUBTEST(test_zero_sized_thread_local());
+  CALL_SUBTEST(test_large_number_of_tasks_no_spill());
+  CALL_SUBTEST(test_large_number_of_tasks_with_spill());
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_thread_pool.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_thread_pool.cpp
index 2ef665f303feed6c711f2cabc896e5c0ca75947f..b772a1d6054e5f9d69fcab8836cee47e8e826b7f 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -16,29 +16,72 @@
 
 using Eigen::Tensor;
 
+class TestAllocator : public Allocator {
+ public:
+  ~TestAllocator() EIGEN_OVERRIDE {}
+  EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
+    const_cast<TestAllocator*>(this)->alloc_count_++;
+    return internal::aligned_malloc(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
+    const_cast<TestAllocator*>(this)->dealloc_count_++;
+    internal::aligned_free(buffer);
+  }
+
+  int alloc_count() const { return alloc_count_; }
+  int dealloc_count() const { return dealloc_count_; }
+
+ private:
+  int alloc_count_ = 0;
+  int dealloc_count_ = 0;
+};
 
 void test_multithread_elementwise()
 {
-  Tensor<float, 3> in1(2,3,7);
-  Tensor<float, 3> in2(2,3,7);
-  Tensor<float, 3> out(2,3,7);
+  Tensor<float, 3> in1(200, 30, 70);
+  Tensor<float, 3> in2(200, 30, 70);
+  Tensor<double, 3> out(200, 30, 70);
 
   in1.setRandom();
   in2.setRandom();
 
   Eigen::ThreadPool tp(internal::random<int>(3, 11));
   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
-  out.device(thread_pool_device) = in1 + in2 * 3.14f;
+  out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
 
-  for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 3; ++j) {
-      for (int k = 0; k < 7; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+  for (int i = 0; i < 200; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
       }
     }
   }
 }
 
+void test_async_multithread_elementwise()
+{
+  Tensor<float, 3> in1(200, 30, 70);
+  Tensor<float, 3> in2(200, 30, 70);
+  Tensor<double, 3> out(200, 30, 70);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+
+  Eigen::Barrier b(1);
+  out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
+  b.Wait();
+
+  for (int i = 0; i < 200; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
+      }
+    }
+  }
+}
 
 void test_multithread_compound_assignment()
 {
@@ -232,6 +275,273 @@ void test_multithread_contraction_agrees_with_singlethread() {
   }
 }
 
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams&, Index, Index, Index num_rows,
+      Index num_cols) const {
+    for (int i = 0; i < num_rows; ++i) {
+      for (int j = 0; j < num_cols; ++j) {
+        output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+      }
+    }
+  }
+};
+
+template <int DataLayout>
+static void test_multithread_contraction_with_output_kernel() {
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in mat4 to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+template<int DataLayout>
+void test_async_multithread_contraction_agrees_with_singlethread()
+{
+  int contract_size = internal::random<int>(100, 500);
+
+  Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
+                                    contract_size,
+                                    internal::random<int>(10, 40));
+
+  Tensor<float, 4, DataLayout> right(
+      internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
+      internal::random<int>(1, 20));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));
+
+  Tensor<float, 5, DataLayout> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
+
+  Eigen::Barrier barrier(1);
+  tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
+      left.contract(right, dims);
+  barrier.Wait();
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // if both of the values are very small, then do nothing (because the test
+    // will fail due to numerical precision issues when values are small)
+    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims);
+  barrier.Wait();
+
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims, SqrtOutputKernel());
+  barrier.Wait();
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
 
 template<int DataLayout>
 void test_full_contraction() {
@@ -320,14 +630,14 @@ void test_multithread_random()
 }
 
 template<int DataLayout>
-void test_multithread_shuffle()
+void test_multithread_shuffle(Allocator* allocator)
 {
   Tensor<float, 4, DataLayout> tensor(17,5,7,11);
   tensor.setRandom();
 
   const int num_threads = internal::random<int>(2, 11);
   ThreadPool threads(num_threads);
-  Eigen::ThreadPoolDevice device(&threads, num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
 
   Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
   array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
@@ -344,10 +654,26 @@ void test_multithread_shuffle()
   }
 }
 
+void test_threadpool_allocate(TestAllocator* allocator)
+{
+  const int num_threads = internal::random<int>(2, 11);
+  const int num_allocs = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
+
+  for (int a = 0; a < num_allocs; ++a) {
+    void* ptr = device.allocate(512);
+    device.deallocate(ptr);
+  }
+  VERIFY(allocator != NULL);
+  VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
+  VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
+}
 
-void test_cxx11_tensor_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
 {
   CALL_SUBTEST_1(test_multithread_elementwise());
+  CALL_SUBTEST_1(test_async_multithread_elementwise());
   CALL_SUBTEST_1(test_multithread_compound_assignment());
 
   CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
@@ -355,19 +681,41 @@ void test_cxx11_tensor_thread_pool()
 
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+  // Test EvalShardedByInnerDimContext parallelization strategy.
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
 
   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
-  CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
+
+  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_8(test_full_contraction<RowMajor>());
+
+  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
 
-  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
-  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+  CALL_SUBTEST_10(test_memcpy());
+  CALL_SUBTEST_10(test_multithread_random());
 
-  CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
-  CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+  TestAllocator test_allocator;
+  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
+  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
+  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
 
-  CALL_SUBTEST_6(test_memcpy());
-  CALL_SUBTEST_6(test_multithread_random());
-  CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
-  CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
 }
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_trace.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_trace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..00972289528ec5077535d846e80ea28812dca523
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_trace.cpp
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_0D_trace() {
+  Tensor<float, 0, DataLayout> tensor;
+  tensor.setRandom();
+  array<ptrdiff_t, 0> dims;
+  Tensor<float, 0, DataLayout> result = tensor.trace(dims);
+  VERIFY_IS_EQUAL(result(), tensor());
+}
+
+
+template <int DataLayout>
+static void test_all_dimensions_trace() {
+  Tensor<float, 3, DataLayout> tensor1(5, 5, 5);
+  tensor1.setRandom();
+  Tensor<float, 0, DataLayout> result1 = tensor1.trace();
+  VERIFY_IS_EQUAL(result1.rank(), 0);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum += tensor1(i, i, i);
+  }
+  VERIFY_IS_EQUAL(result1(), sum);
+
+  Tensor<float, 5, DataLayout> tensor2(7, 7, 7, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 5> dims = { { 2, 1, 0, 3, 4 } };
+  Tensor<float, 0, DataLayout> result2 = tensor2.trace(dims);
+  VERIFY_IS_EQUAL(result2.rank(), 0);
+  sum = 0.0f;
+  for (int i = 0; i < 7; ++i) {
+    sum += tensor2(i, i, i, i, i);
+  }
+  VERIFY_IS_EQUAL(result2(), sum);
+}
+
+
+template <int DataLayout>
+static void test_simple_trace() {
+  Tensor<float, 3, DataLayout> tensor1(3, 5, 3);
+  tensor1.setRandom();
+  array<ptrdiff_t, 2> dims1 = { { 0, 2 } };
+  Tensor<float, 1, DataLayout> result1 = tensor1.trace(dims1);
+  VERIFY_IS_EQUAL(result1.rank(), 1);
+  VERIFY_IS_EQUAL(result1.dimension(0), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum = 0.0f;
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor1(j, i, j);
+    }
+    VERIFY_IS_EQUAL(result1(i), sum);
+  }
+
+  Tensor<float, 4, DataLayout> tensor2(5, 5, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 2> dims2 = { { 2, 3 } };
+  Tensor<float, 2, DataLayout> result2 = tensor2.trace(dims2);
+  VERIFY_IS_EQUAL(result2.rank(), 2);
+  VERIFY_IS_EQUAL(result2.dimension(0), 5);
+  VERIFY_IS_EQUAL(result2.dimension(1), 5);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 7; ++k) {
+        sum += tensor2(i, j, k, k);
+      }
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+    }
+  }
+
+  array<ptrdiff_t, 2> dims3 = { { 1, 0 } };
+  Tensor<float, 2, DataLayout> result3 = tensor2.trace(dims3);
+  VERIFY_IS_EQUAL(result3.rank(), 2);
+  VERIFY_IS_EQUAL(result3.dimension(0), 7);
+  VERIFY_IS_EQUAL(result3.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 5; ++k) {
+        sum += tensor2(k, k, i, j);
+      }
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor3(3, 7, 3, 7, 3);
+  tensor3.setRandom();
+  array<ptrdiff_t, 3> dims4 = { { 0, 2, 4 } };
+  Tensor<float, 2, DataLayout> result4 = tensor3.trace(dims4);
+  VERIFY_IS_EQUAL(result4.rank(), 2);
+  VERIFY_IS_EQUAL(result4.dimension(0), 7);
+  VERIFY_IS_EQUAL(result4.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor3(k, i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result4(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor4(3, 7, 4, 7, 5);
+  tensor4.setRandom();
+  array<ptrdiff_t, 2> dims5 = { { 1, 3 } };
+  Tensor<float, 3, DataLayout> result5 = tensor4.trace(dims5);
+  VERIFY_IS_EQUAL(result5.rank(), 3);
+  VERIFY_IS_EQUAL(result5.dimension(0), 3);
+  VERIFY_IS_EQUAL(result5.dimension(1), 4);
+  VERIFY_IS_EQUAL(result5.dimension(2), 5);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        sum = 0.0f;
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor4(i, l, j, l, k);
+        }
+        VERIFY_IS_EQUAL(result5(i, j, k), sum);
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_trace_in_expr() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> dims = { { 1, 3 } };
+  Tensor<float, 2, DataLayout> result(2, 5);
+  result = result.constant(1.0f) - tensor.trace(dims);
+  VERIFY_IS_EQUAL(result.rank(), 2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor(i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result(i, j), 1.0f - sum);
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_trace) {
+  CALL_SUBTEST(test_0D_trace<ColMajor>());
+  CALL_SUBTEST(test_0D_trace<RowMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<ColMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<RowMajor>());
+  CALL_SUBTEST(test_simple_trace<ColMajor>());
+  CALL_SUBTEST(test_simple_trace<RowMajor>());
+  CALL_SUBTEST(test_trace_in_expr<ColMajor>());
+  CALL_SUBTEST(test_trace_in_expr<RowMajor>());
+}
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_uint128.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_uint128.cpp
index d2a1e8673e05bcf965a044d3df0309ef5a325caa..46fceaa19cb5e1f8e261d52f2cdabd463aba332c 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_uint128.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_uint128.cpp
@@ -12,7 +12,7 @@
 #include <Eigen/CXX11/Tensor>
 
 
-#if EIGEN_COMP_MSVC
+#if EIGEN_COMP_MSVC || !defined(__SIZEOF_INT128__)
 #define EIGEN_NO_INT128
 #else
 typedef __uint128_t uint128_t;
@@ -144,7 +144,7 @@ void test_misc2() {
 #endif
 
 
-void test_cxx11_tensor_uint128()
+EIGEN_DECLARE_TEST(cxx11_tensor_uint128)
 {
 #ifdef EIGEN_NO_INT128
   // Skip the test on compilers that don't support 128bit integers natively
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_volume_patch.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_volume_patch.cpp
index ca6840f3bfe127d7d52a92e48e146582ae9b9bf1..862212e8236fccbe0953ebac78517f609f433075 100644
--- a/contrib/eigen/unsupported/test/cxx11_tensor_volume_patch.cpp
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -70,9 +70,9 @@ static void test_entire_volume_patch()
   const int dy = patch_y - 1;
   const int dx = patch_x - 1;
 
-  const int forward_pad_z = dz - dz / 2;
-  const int forward_pad_y = dy - dy / 2;
-  const int forward_pad_x = dx - dx / 2;
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
 
   for (int pz = 0; pz < patch_z; pz++) {
     for (int py = 0; py < patch_y; py++) {
@@ -105,7 +105,7 @@ static void test_entire_volume_patch()
   }
 }
 
-void test_cxx11_tensor_volume_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch)
 {
   CALL_SUBTEST(test_single_voxel_patch());
   CALL_SUBTEST(test_entire_volume_patch());
diff --git a/contrib/eigen/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/contrib/eigen/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8d99a48ed847a5eb652dc8f9507c0a2b911dffd5
--- /dev/null
+++ b/contrib/eigen/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+IndexType sizeDim0 = 4;
+IndexType sizeDim1 = 2;
+IndexType sizeDim2 = 3;
+IndexType sizeDim3 = 5;
+IndexType sizeDim4 = 7;
+array<IndexType, 5> tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+array<IndexType, 5> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}};
+Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+tensor_col_major.setRandom();
+
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+
+
+  // single volume patch: ColMajor
+  array<IndexType, 6> patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 6, DataLayout,IndexType> single_voxel_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange);
+  gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7);
+
+  array<IndexType, 6> patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}};
+  Tensor<DataType, 6, RowMajor,IndexType> single_voxel_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+       VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+  }
+
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  const int depth = 4;
+  const int patch_z = 2;
+  const int patch_y = 3;
+  const int patch_x = 5;
+  const int batch = 7;
+
+  array<IndexType, 5> tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}};
+  array<IndexType, 5> tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+
+    DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+    DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+    TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+    TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+    sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+    gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+    sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+
+    // single volume patch: ColMajor
+    array<IndexType, 6> patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}};
+    Tensor<DataType, 6, DataLayout,IndexType> entire_volume_patch_col_major(patchColMajorTensorRange);
+    size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType);
+    DataType* gpu_data_entire_volume_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+    TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange);
+    gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x);
+    sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize);
+
+
+//  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
+//  tensor.setRandom();
+//  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  //Tensor<float, 6> entire_volume_patch;
+  //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch);
+
+//  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
+  //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+
+  array<IndexType, 6> patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 6, RowMajor,IndexType> entire_volume_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_volume_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
+
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor_col_major(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_row_major);
+}
+
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_volume_patch_test_per_device(dev_Selector s){
+QueueInterface queueInterface(s);
+auto sycl_device = Eigen::SyclDevice(&queueInterface);
+std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>() << std::endl;
+test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device);
+test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
+{
+for (const auto& device :Eigen::get_sycl_supported_devices()) {
+  CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
+}
+}
diff --git a/contrib/eigen/unsupported/test/dgmres.cpp b/contrib/eigen/unsupported/test/dgmres.cpp
index 2b11807c8b25ad9662750b6dc9fbd7a64450d53d..5f63161b26d0eb66ba5c17de789c8b5cd9a615c1 100644
--- a/contrib/eigen/unsupported/test/dgmres.cpp
+++ b/contrib/eigen/unsupported/test/dgmres.cpp
@@ -9,7 +9,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "../../test/sparse_solver.h"
-#include <Eigen/src/IterativeSolvers/DGMRES.h>
+#include <unsupported/Eigen/IterativeSolvers>
 
 template<typename T> void test_dgmres_T()
 {
@@ -24,7 +24,7 @@ template<typename T> void test_dgmres_T()
   //CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor)     );
 }
 
-void test_dgmres()
+EIGEN_DECLARE_TEST(dgmres)
 {
   CALL_SUBTEST_1(test_dgmres_T<double>());
   CALL_SUBTEST_2(test_dgmres_T<std::complex<double> >());
diff --git a/contrib/eigen/unsupported/test/forward_adolc.cpp b/contrib/eigen/unsupported/test/forward_adolc.cpp
index 866db8e86f0b67c4c703c9964bf1b730663cbcf5..14a909d3b9d49159708293af048ca17f20834a6e 100644
--- a/contrib/eigen/unsupported/test/forward_adolc.cpp
+++ b/contrib/eigen/unsupported/test/forward_adolc.cpp
@@ -35,7 +35,7 @@ struct TestFunc1
   int m_inputs, m_values;
 
   TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -119,7 +119,7 @@ template<typename Func> void adolc_forward_jacobian(const Func& f)
     VERIFY_IS_APPROX(j, jref);
 }
 
-void test_forward_adolc()
+EIGEN_DECLARE_TEST(forward_adolc)
 {
   adtl::setNumDir(NUMBER_DIRECTIONS);
 
@@ -132,7 +132,7 @@ void test_forward_adolc()
   }
 
   {
-    // simple instanciation tests
+    // simple instantiation tests
     Matrix<adtl::adouble,2,1> x;
     foo(x);
     Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);;
diff --git a/contrib/eigen/unsupported/test/gmres.cpp b/contrib/eigen/unsupported/test/gmres.cpp
index f2969116b572ab5adb0e9fa02b35e07a74350281..8d2254b5b6ba0f81a27fd447290cd6c4a73c4228 100644
--- a/contrib/eigen/unsupported/test/gmres.cpp
+++ b/contrib/eigen/unsupported/test/gmres.cpp
@@ -24,7 +24,7 @@ template<typename T> void test_gmres_T()
   //CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor)     );
 }
 
-void test_gmres()
+EIGEN_DECLARE_TEST(gmres)
 {
   CALL_SUBTEST_1(test_gmres_T<double>());
   CALL_SUBTEST_2(test_gmres_T<std::complex<double> >());
diff --git a/contrib/eigen/unsupported/test/idrs.cpp b/contrib/eigen/unsupported/test/idrs.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f88c01632fbc6726d2c73c7832923ae259905940
--- /dev/null
+++ b/contrib/eigen/unsupported/test/idrs.cpp
@@ -0,0 +1,27 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 Kolja Brix <brix@igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_idrs_T()
+{
+  IDRS<SparseMatrix<T>, DiagonalPreconditioner<T> > idrs_colmajor_diag;
+  IDRS<SparseMatrix<T>, IncompleteLUT<T> >           idrs_colmajor_ilut;
+
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_ilut)     );
+}
+
+EIGEN_DECLARE_TEST(idrs)
+{
+  CALL_SUBTEST_1(test_idrs_T<double>());
+  CALL_SUBTEST_2(test_idrs_T<std::complex<double> >());
+}
diff --git a/contrib/eigen/unsupported/test/kronecker_product.cpp b/contrib/eigen/unsupported/test/kronecker_product.cpp
index e770049e5d5af91e5533e973273b613eda543bb4..b5b764c65369f012188b9c5a00d2d822cef4322b 100644
--- a/contrib/eigen/unsupported/test/kronecker_product.cpp
+++ b/contrib/eigen/unsupported/test/kronecker_product.cpp
@@ -9,6 +9,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+
 #ifdef EIGEN_TEST_PART_1
 
 #include "sparse.h"
@@ -83,7 +84,7 @@ void check_sparse_kronecker_product(const MatrixType& ab)
 }
 
 
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
 {
   // DM = dense matrix; SM = sparse matrix
 
@@ -95,7 +96,7 @@ void test_kronecker_product()
   SM_a.insert(1,0) = DM_a.coeffRef(1,0) = -0.9076572187376921;
   SM_a.insert(1,1) = DM_a.coeffRef(1,1) =  0.6469156566545853;
   SM_a.insert(1,2) = DM_a.coeffRef(1,2) = -0.3658010398782789;
- 
+
   MatrixXd             DM_b(3,2);
   SparseMatrix<double> SM_b(3,2);
   SM_b.insert(0,0) = DM_b.coeffRef(0,0) =  0.9004440976767099;
@@ -165,7 +166,7 @@ void test_kronecker_product()
   SM_a.insert(0,3) = -0.2;
   SM_a.insert(2,4) =  0.3;
   SM_a.finalize();
-  
+
   SM_b.insert(0,0) =  0.4;
   SM_b.insert(2,1) = -0.5;
   SM_b.finalize();
@@ -183,7 +184,7 @@ void test_kronecker_product()
   DM_b2.resize(4,8);
   DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
   CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));
-  
+
   for(int i = 0; i < g_repeat; i++)
   {
     double density = Eigen::internal::random<double>(0.01,0.5);
@@ -196,35 +197,35 @@ void test_kronecker_product()
     MatrixXf dA(ra,ca), dB(rb,cb), dC;
     initSparse(density, dA, sA);
     initSparse(density, dB, sB);
-    
+
     sC = kroneckerProduct(sA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA.transpose(),sB);
     dC = kroneckerProduct(dA.transpose(),dB);
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA.transpose(),sB.transpose());
     dC = kroneckerProduct(dA.transpose(),dB.transpose());
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA,sB.transpose());
     dC = kroneckerProduct(dA,dB.transpose());
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC2 = kroneckerProduct(sA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(dA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(sA,dB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(2*sA,sB);
     dC = kroneckerProduct(2*dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
@@ -236,11 +237,10 @@ void test_kronecker_product()
 #ifdef EIGEN_TEST_PART_2
 
 // simply check that for a dense kronecker product, sparse module is not needed
-
 #include "main.h"
 #include <Eigen/KroneckerProduct>
 
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
 {
   MatrixXd a(2,2), b(3,3), c;
   a.setRandom();
diff --git a/contrib/eigen/unsupported/test/levenberg_marquardt.cpp b/contrib/eigen/unsupported/test/levenberg_marquardt.cpp
index 64f168c16276724628c89289dec5455ec10fa075..7f9a81cd3063ef98753bd51c4de5984685db88ad 100644
--- a/contrib/eigen/unsupported/test/levenberg_marquardt.cpp
+++ b/contrib/eigen/unsupported/test/levenberg_marquardt.cpp
@@ -1445,7 +1445,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_levenberg_marquardt()
+EIGEN_DECLARE_TEST(levenberg_marquardt)
 {
     // Tests using the examples provided by (c)minpack
     CALL_SUBTEST(testLmder1());
diff --git a/contrib/eigen/unsupported/test/matrix_exponential.cpp b/contrib/eigen/unsupported/test/matrix_exponential.cpp
index 50dec083d77ce8a02d47fa8e3cb4b6b3e7527096..b032cbf1dd1677f2e1466d2db3e49c56b9f90235 100644
--- a/contrib/eigen/unsupported/test/matrix_exponential.cpp
+++ b/contrib/eigen/unsupported/test/matrix_exponential.cpp
@@ -119,7 +119,7 @@ void randomTest(const MatrixType& m, double tol)
   }
 }
 
-void test_matrix_exponential()
+EIGEN_DECLARE_TEST(matrix_exponential)
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
   CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
diff --git a/contrib/eigen/unsupported/test/matrix_function.cpp b/contrib/eigen/unsupported/test/matrix_function.cpp
index 005c9c15f35992a417055745fa49fade8299539d..6d753737d26145080e04feb2328df3850f7146a7 100644
--- a/contrib/eigen/unsupported/test/matrix_function.cpp
+++ b/contrib/eigen/unsupported/test/matrix_function.cpp
@@ -23,7 +23,7 @@ inline bool test_isApprox_abs(const Type1& a, const Type2& b)
 
 // Returns a matrix with eigenvalues clustered around 0, 1 and 2.
 template<typename MatrixType>
-MatrixType randomMatrixWithRealEivals(const typename MatrixType::Index size)
+MatrixType randomMatrixWithRealEivals(const Index size)
 {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
@@ -41,14 +41,14 @@ template <typename MatrixType, int IsComplex = NumTraits<typename internal::trai
 struct randomMatrixWithImagEivals
 {
   // Returns a matrix with eigenvalues clustered around 0 and +/- i.
-  static MatrixType run(const typename MatrixType::Index size);
+  static MatrixType run(const Index size);
 };
 
 // Partial specialization for real matrices
 template<typename MatrixType>
 struct randomMatrixWithImagEivals<MatrixType, 0>
 {
-  static MatrixType run(const typename MatrixType::Index size)
+  static MatrixType run(const Index size)
   {
     typedef typename MatrixType::Scalar Scalar;
     MatrixType diag = MatrixType::Zero(size, size);
@@ -75,7 +75,7 @@ struct randomMatrixWithImagEivals<MatrixType, 0>
 template<typename MatrixType>
 struct randomMatrixWithImagEivals<MatrixType, 1>
 {
-  static MatrixType run(const typename MatrixType::Index size)
+  static MatrixType run(const Index size)
   {
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
@@ -210,7 +210,7 @@ void testMapRef(const MatrixType& A)
 }
 
 
-void test_matrix_function()
+EIGEN_DECLARE_TEST(matrix_function)
 {
   CALL_SUBTEST_1(testMatrixType(Matrix<float,1,1>()));
   CALL_SUBTEST_2(testMatrixType(Matrix3cf()));
diff --git a/contrib/eigen/unsupported/test/matrix_power.cpp b/contrib/eigen/unsupported/test/matrix_power.cpp
index 7ccfacfdfd0f01b05dea51a684f715b6f47d55c5..dbaf9dbdf70475e6e070b2be8a72e88f6aa7a90d 100644
--- a/contrib/eigen/unsupported/test/matrix_power.cpp
+++ b/contrib/eigen/unsupported/test/matrix_power.cpp
@@ -19,7 +19,7 @@ void test2dRotation(const T& tol)
   MatrixPower<Matrix<T,2,2> > Apow(A);
 
   for (int i=0; i<=20; ++i) {
-    angle = std::pow(T(10), (i-10) / T(5.));
+    angle = std::pow(T(10), T(i-10) / T(5.));
     c = std::cos(angle);
     s = std::sin(angle);
     B << c, s, -s, c;
@@ -61,7 +61,7 @@ void test3dRotation(const T& tol)
   for (int i=0; i<=20; ++i) {
     v = Matrix<T,3,1>::Random();
     v.normalize();
-    angle = std::pow(T(10), (i-10) / T(5.));
+    angle = std::pow(T(10), T(i-10) / T(5.));
     VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
   }
 }
@@ -150,55 +150,55 @@ typedef Matrix<double,3,3,RowMajor>         Matrix3dRowMajor;
 typedef Matrix<long double,3,3>             Matrix3e;
 typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
  
-void test_matrix_power()
+EIGEN_DECLARE_TEST(matrix_power)
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
-  CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+  CALL_SUBTEST_1(test2dRotation<float>(2e-5f));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
   CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
   CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
-  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
+  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5f));
   CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
 
   CALL_SUBTEST_10(test3dRotation<double>(1e-13));
-  CALL_SUBTEST_11(test3dRotation<float>(1e-5));
+  CALL_SUBTEST_11(test3dRotation<float>(1e-5f));
   CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
 
   CALL_SUBTEST_2(testGeneral(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testGeneral(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testGeneral(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3)); // see bug 614
+  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3f)); // see bug 614
   CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testGeneral(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testSingular(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testSingular(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testSingular(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3f));
   CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testSingular(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testLogThenExp(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testLogThenExp(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3f));
   CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testLogThenExp(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13L));
 }
diff --git a/contrib/eigen/unsupported/test/matrix_square_root.cpp b/contrib/eigen/unsupported/test/matrix_square_root.cpp
index ea541e1eaaa51c26fbc8b293d7f77414795beb11..034f29217039af6bf9ceba1f0da89e30479e1bb0 100644
--- a/contrib/eigen/unsupported/test/matrix_square_root.cpp
+++ b/contrib/eigen/unsupported/test/matrix_square_root.cpp
@@ -18,7 +18,7 @@ void testMatrixSqrt(const MatrixType& m)
   VERIFY_IS_APPROX(sqrtA * sqrtA, A);
 }
 
-void test_matrix_square_root()
+EIGEN_DECLARE_TEST(matrix_square_root)
 {
   for (int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf()));
diff --git a/contrib/eigen/unsupported/test/minres.cpp b/contrib/eigen/unsupported/test/minres.cpp
index 8b300b78a557fc939bed6d7376237650c1f8c994..2eb40fef646bfc8af70795258170089e2b1f73db 100644
--- a/contrib/eigen/unsupported/test/minres.cpp
+++ b/contrib/eigen/unsupported/test/minres.cpp
@@ -36,7 +36,7 @@ template<typename T> void test_minres_T()
 
 }
 
-void test_minres()
+EIGEN_DECLARE_TEST(minres)
 {
   CALL_SUBTEST_1(test_minres_T<double>());
 //  CALL_SUBTEST_2(test_minres_T<std::compex<double> >());
diff --git a/contrib/eigen/unsupported/test/mpreal/mpreal.h b/contrib/eigen/unsupported/test/mpreal/mpreal.h
deleted file mode 100644
index 8404f1ff837e2b5961aebc16ade8efd2cf59c3ac..0000000000000000000000000000000000000000
--- a/contrib/eigen/unsupported/test/mpreal/mpreal.h
+++ /dev/null
@@ -1,3104 +0,0 @@
-/*
-    MPFR C++: Multi-precision floating point number class for C++.
-    Based on MPFR library:    http://mpfr.org
-
-    Project homepage:    http://www.holoborodko.com/pavel/mpfr
-    Contact e-mail:      pavel@holoborodko.com
-
-    Copyright (c) 2008-2015 Pavel Holoborodko
-
-    Contributors:
-    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
-    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
-    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
-    Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
-    Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
-    Rodney James, Jorge Leitao.
-
-    Licensing:
-    (A) MPFR C++ is under GNU General Public License ("GPL").
-
-    (B) Non-free licenses may also be purchased from the author, for users who
-        do not want their programs protected by the GPL.
-
-        The non-free licenses are for users that wish to use MPFR C++ in
-        their products but are unwilling to release their software
-        under the GPL (which would require them to release source code
-        and allow free redistribution).
-
-        Such users can purchase an unlimited-use license from the author.
-        Contact us for more details.
-
-    GNU General Public License ("GPL") copyright permissions statement:
-    **************************************************************************
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MPREAL_H__
-#define __MPREAL_H__
-
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <cfloat>
-#include <cmath>
-#include <cstring>
-#include <limits>
-#include <complex>
-#include <algorithm>
-
-// Options
-#define MPREAL_HAVE_MSVC_DEBUGVIEW              // Enable Debugger Visualizer for "Debug" builds in MSVC.
-#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS  // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
-                                                // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
-                                                // See std::numeric_limits<mpfr::mpreal> at the end of the file for more information.
-
-// Library version
-#define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 6
-#define MPREAL_VERSION_PATCHLEVEL 2
-#define MPREAL_VERSION_STRING "3.6.2"
-
-// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__)
-    #define IsInf(x) (isinf)(x)                 // GNU C++/Intel ICC compiler on Linux
-#elif defined(_MSC_VER)                         // Microsoft Visual C++
-    #define IsInf(x) (!_finite(x))
-#else
-    #define IsInf(x) (std::isinf)(x)              // GNU C/C++ (and/or other compilers), just hope for C99 conformance
-#endif
-
-// A Clang feature extension to determine compiler features.
-#ifndef __has_feature
-    #define __has_feature(x) 0
-#endif
-
-// Detect support for r-value references (move semantic). Borrowed from Eigen.
-#if (__has_feature(cxx_rvalue_references) || \
-       defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
-      (defined(_MSC_VER) && _MSC_VER >= 1600))
-
-    #define MPREAL_HAVE_MOVE_SUPPORT
-
-    // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
-    #define mpfr_is_initialized(x)      (0 != (x)->_mpfr_d)
-    #define mpfr_set_uninitialized(x)   ((x)->_mpfr_d = 0 )
-#endif
-
-// Detect support for explicit converters.
-#if (__has_feature(cxx_explicit_conversions) || \
-       (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
-       (defined(_MSC_VER) && _MSC_VER >= 1800))
-
-    #define MPREAL_HAVE_EXPLICIT_CONVERTERS
-#endif
-
-#define MPFR_USE_INTMAX_T   // Enable 64-bit integer types - should be defined before mpfr.h
-
-#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
-    #define MPREAL_MSVC_DEBUGVIEW_CODE     DebugView = toString();
-    #define MPREAL_MSVC_DEBUGVIEW_DATA     std::string DebugView;
-#else
-    #define MPREAL_MSVC_DEBUGVIEW_CODE
-    #define MPREAL_MSVC_DEBUGVIEW_DATA
-#endif
-
-#include <mpfr.h>
-
-#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0))
-    #include <cstdlib>                          // Needed for random()
-#endif
-
-// Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1          // Triggers overflow exception during conversion to double if mpreal
-                                                // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
-                                                // = -1 disables overflow checks (default)
-
-// Fast replacement for mpfr_set_zero(x, +1):
-// (a) uses low-level data members, might not be compatible with new versions of MPFR
-// (b) sign is not set, add (x)->_mpfr_sign = 1;
-#define mpfr_set_zero_fast(x)  ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
-
-#if defined(__GNUC__)
-  #define MPREAL_PERMISSIVE_EXPR __extension__
-#else
-  #define MPREAL_PERMISSIVE_EXPR
-#endif
-
-namespace mpfr {
-
-class mpreal {
-private:
-    mpfr_t mp;
-
-public:
-
-    // Get default rounding mode & precision
-    inline static mp_rnd_t   get_default_rnd()    {    return (mp_rnd_t)(mpfr_get_default_rounding_mode());       }
-    inline static mp_prec_t  get_default_prec()   {    return mpfr_get_default_prec();                            }
-
-    // Constructors && type conversions
-    mpreal();
-    mpreal(const mpreal& u);
-    mpreal(const mpf_t u);
-    mpreal(const mpz_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const mpq_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const double u,                 mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long double u,            mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long long int u,          mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long int u,      mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned int u,           mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long int u,               mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const int u,                    mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-
-    // Construct mpreal from mpfr_t structure.
-    // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
-    mpreal(const mpfr_t  u, bool shared = false);
-
-    mpreal(const char* s,             mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const std::string& s,      mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-
-    ~mpreal();
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-    mpreal& operator=(mpreal&& v);
-    mpreal(mpreal&& u);
-#endif
-
-    // Operations
-    // =
-    // +, -, *, /, ++, --, <<, >>
-    // *=, +=, -=, /=,
-    // <, >, ==, <=, >=
-
-    // =
-    mpreal& operator=(const mpreal& v);
-    mpreal& operator=(const mpf_t v);
-    mpreal& operator=(const mpz_t v);
-    mpreal& operator=(const mpq_t v);
-    mpreal& operator=(const long double v);
-    mpreal& operator=(const double v);
-    mpreal& operator=(const unsigned long int v);
-    mpreal& operator=(const unsigned long long int v);
-    mpreal& operator=(const long long int v);
-    mpreal& operator=(const unsigned int v);
-    mpreal& operator=(const long int v);
-    mpreal& operator=(const int v);
-    mpreal& operator=(const char* s);
-    mpreal& operator=(const std::string& s);
-    template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
-
-    // +
-    mpreal& operator+=(const mpreal& v);
-    mpreal& operator+=(const mpf_t v);
-    mpreal& operator+=(const mpz_t v);
-    mpreal& operator+=(const mpq_t v);
-    mpreal& operator+=(const long double u);
-    mpreal& operator+=(const double u);
-    mpreal& operator+=(const unsigned long int u);
-    mpreal& operator+=(const unsigned int u);
-    mpreal& operator+=(const long int u);
-    mpreal& operator+=(const int u);
-
-    mpreal& operator+=(const long long int  u);
-    mpreal& operator+=(const unsigned long long int u);
-    mpreal& operator-=(const long long int  u);
-    mpreal& operator-=(const unsigned long long int u);
-    mpreal& operator*=(const long long int  u);
-    mpreal& operator*=(const unsigned long long int u);
-    mpreal& operator/=(const long long int  u);
-    mpreal& operator/=(const unsigned long long int u);
-
-    const mpreal operator+() const;
-    mpreal& operator++ ();
-    const mpreal  operator++ (int);
-
-    // -
-    mpreal& operator-=(const mpreal& v);
-    mpreal& operator-=(const mpz_t v);
-    mpreal& operator-=(const mpq_t v);
-    mpreal& operator-=(const long double u);
-    mpreal& operator-=(const double u);
-    mpreal& operator-=(const unsigned long int u);
-    mpreal& operator-=(const unsigned int u);
-    mpreal& operator-=(const long int u);
-    mpreal& operator-=(const int u);
-    const mpreal operator-() const;
-    friend const mpreal operator-(const unsigned long int b, const mpreal& a);
-    friend const mpreal operator-(const unsigned int b,      const mpreal& a);
-    friend const mpreal operator-(const long int b,          const mpreal& a);
-    friend const mpreal operator-(const int b,               const mpreal& a);
-    friend const mpreal operator-(const double b,            const mpreal& a);
-    mpreal& operator-- ();
-    const mpreal  operator-- (int);
-
-    // *
-    mpreal& operator*=(const mpreal& v);
-    mpreal& operator*=(const mpz_t v);
-    mpreal& operator*=(const mpq_t v);
-    mpreal& operator*=(const long double v);
-    mpreal& operator*=(const double v);
-    mpreal& operator*=(const unsigned long int v);
-    mpreal& operator*=(const unsigned int v);
-    mpreal& operator*=(const long int v);
-    mpreal& operator*=(const int v);
-
-    // /
-    mpreal& operator/=(const mpreal& v);
-    mpreal& operator/=(const mpz_t v);
-    mpreal& operator/=(const mpq_t v);
-    mpreal& operator/=(const long double v);
-    mpreal& operator/=(const double v);
-    mpreal& operator/=(const unsigned long int v);
-    mpreal& operator/=(const unsigned int v);
-    mpreal& operator/=(const long int v);
-    mpreal& operator/=(const int v);
-    friend const mpreal operator/(const unsigned long int b, const mpreal& a);
-    friend const mpreal operator/(const unsigned int b,      const mpreal& a);
-    friend const mpreal operator/(const long int b,          const mpreal& a);
-    friend const mpreal operator/(const int b,               const mpreal& a);
-    friend const mpreal operator/(const double b,            const mpreal& a);
-
-    //<<= Fast Multiplication by 2^u
-    mpreal& operator<<=(const unsigned long int u);
-    mpreal& operator<<=(const unsigned int u);
-    mpreal& operator<<=(const long int u);
-    mpreal& operator<<=(const int u);
-
-    //>>= Fast Division by 2^u
-    mpreal& operator>>=(const unsigned long int u);
-    mpreal& operator>>=(const unsigned int u);
-    mpreal& operator>>=(const long int u);
-    mpreal& operator>>=(const int u);
-
-    // Type Conversion operators
-    bool               toBool      (                        )    const;
-    long               toLong      (mp_rnd_t mode = GMP_RNDZ)    const;
-    unsigned long      toULong     (mp_rnd_t mode = GMP_RNDZ)    const;
-    long long          toLLong     (mp_rnd_t mode = GMP_RNDZ)    const;
-    unsigned long long toULLong    (mp_rnd_t mode = GMP_RNDZ)    const;
-    float              toFloat     (mp_rnd_t mode = GMP_RNDN)    const;
-    double             toDouble    (mp_rnd_t mode = GMP_RNDN)    const;
-    long double        toLDouble   (mp_rnd_t mode = GMP_RNDN)    const;
-
-#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
-    explicit operator bool               () const { return toBool();                 }
-    explicit operator int                () const { return int(toLong());            }
-    explicit operator long               () const { return toLong();                 }
-    explicit operator long long          () const { return toLLong();                }
-    explicit operator unsigned           () const { return unsigned(toULong());      }
-    explicit operator unsigned long      () const { return toULong();                }
-    explicit operator unsigned long long () const { return toULLong();               }
-    explicit operator float              () const { return toFloat();                }
-    explicit operator double             () const { return toDouble();               }
-    explicit operator long double        () const { return toLDouble();              }
-#endif
-
-    // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
-    ::mpfr_ptr    mpfr_ptr();
-    ::mpfr_srcptr mpfr_ptr()    const;
-    ::mpfr_srcptr mpfr_srcptr() const;
-
-    // Convert mpreal to string with n significant digits in base b
-    // n = -1 -> convert with the maximum available digits
-    std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    std::string toString(const std::string& format) const;
-#endif
-
-    std::ostream& output(std::ostream& os) const;
-
-    // Math Functions
-    friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode);
-    friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode);
-    friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
-    friend int cmpabs(const mpreal& a,const mpreal& b);
-
-    friend const mpreal log  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode);
-    friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal acos  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asin  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atan  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode);
-    friend const mpreal acot  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asec  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acsc  (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal cosh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sinh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tanh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sech  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal csch  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal coth  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-    friend const mpreal fac_ui (unsigned long int v,  mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal eint   (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal gamma    (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tgamma   (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal lngamma  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal lgamma   (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
-    friend const mpreal zeta     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal erf      (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal erfc     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal fma      (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
-    friend const mpreal fms      (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
-    friend const mpreal agm      (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
-    friend const mpreal sum      (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
-    friend int sgn(const mpreal& v); // returns -1 or +1
-
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    friend int          sinh_cosh   (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal li2         (const mpreal& v,                       mp_rnd_t rnd_mode);
-    friend const mpreal fmod        (const mpreal& x, const mpreal& y,      mp_rnd_t rnd_mode);
-    friend const mpreal rec_sqrt    (const mpreal& v,                       mp_rnd_t rnd_mode);
-
-    // MATLAB's semantic equivalents
-    friend const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Remainder after division
-    friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    friend const mpreal digamma (const mpreal& v,        mp_rnd_t rnd_mode);
-    friend const mpreal ai      (const mpreal& v,        mp_rnd_t rnd_mode);
-    friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-    friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal grandom (unsigned int seed);
-#endif
-
-    // Uniformly distributed random number generation in [0,1] using
-    // Mersenne-Twister algorithm by default.
-    // Use parameter to setup seed, e.g.: random((unsigned)time(NULL))
-    // Check urandom() for more precise control.
-    friend const mpreal random(unsigned int seed);
-
-    // Splits mpreal value into fractional and integer parts.
-    // Returns fractional part and stores integer part in n.
-    friend const mpreal modf(const mpreal& v, mpreal& n);
-
-    // Constants
-    // don't forget to call mpfr_free_cache() for every thread where you are using const-functions
-    friend const mpreal const_log2      (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_pi        (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_euler     (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_catalan   (mp_prec_t prec, mp_rnd_t rnd_mode);
-
-    // returns +inf iff sign>=0 otherwise -inf
-    friend const mpreal const_infinity(int sign, mp_prec_t prec);
-
-    // Output/ Input
-    friend std::ostream& operator<<(std::ostream& os, const mpreal& v);
-    friend std::istream& operator>>(std::istream& is, mpreal& v);
-
-    // Integer Related Functions
-    friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal ceil (const mpreal& v);
-    friend const mpreal floor(const mpreal& v);
-    friend const mpreal round(const mpreal& v);
-    friend const mpreal trunc(const mpreal& v);
-    friend const mpreal rint_ceil   (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_floor  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_round  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_trunc  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal frac        (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal remainder   (         const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal remquo      (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-    // Miscellaneous Functions
-    friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
-    friend const mpreal nextabove  (const mpreal& x);
-    friend const mpreal nextbelow  (const mpreal& x);
-
-    // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal urandomb (gmp_randstate_t& state);
-
-// MPFR < 2.4.2 Specifics
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-    friend const mpreal random2 (mp_size_t size, mp_exp_t exp);
-#endif
-
-    // Instance Checkers
-    friend bool (isnan)    (const mpreal& v);
-    friend bool (isinf)    (const mpreal& v);
-    friend bool (isfinite) (const mpreal& v);
-
-    friend bool isnum    (const mpreal& v);
-    friend bool iszero   (const mpreal& v);
-    friend bool isint    (const mpreal& v);
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    friend bool isregular(const mpreal& v);
-#endif
-
-    // Set/Get instance properties
-    inline mp_prec_t    get_prec() const;
-    inline void         set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd());    // Change precision with rounding mode
-
-    // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex<mpreal> interface
-    inline mpreal&      setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
-    inline int          getPrecision() const;
-
-    // Set mpreal to +/- inf, NaN, +/-0
-    mpreal&        setInf  (int Sign = +1);
-    mpreal&        setNan  ();
-    mpreal&        setZero (int Sign = +1);
-    mpreal&        setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
-
-    //Exponent
-    mp_exp_t get_exp();
-    int set_exp(mp_exp_t e);
-    int check_range  (int t, mp_rnd_t rnd_mode = get_default_rnd());
-    int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
-
-    // Inexact conversion from float
-    inline bool fits_in_bits(double x, int n);
-
-    // Set/Get global properties
-    static void            set_default_prec(mp_prec_t prec);
-    static void            set_default_rnd(mp_rnd_t rnd_mode);
-
-    static mp_exp_t  get_emin (void);
-    static mp_exp_t  get_emax (void);
-    static mp_exp_t  get_emin_min (void);
-    static mp_exp_t  get_emin_max (void);
-    static mp_exp_t  get_emax_min (void);
-    static mp_exp_t  get_emax_max (void);
-    static int       set_emin (mp_exp_t exp);
-    static int       set_emax (mp_exp_t exp);
-
-    // Efficient swapping of two mpreal values - needed for std algorithms
-    friend void swap(mpreal& x, mpreal& y);
-
-    friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-private:
-    // Human friendly Debug Preview in Visual Studio.
-    // Put one of these lines:
-    //
-    // mpfr::mpreal=<DebugView>                              ; Show value only
-    // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits    ; Show value & precision
-    //
-    // at the beginning of
-    // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
-    MPREAL_MSVC_DEBUGVIEW_DATA
-
-    // "Smart" resources deallocation. Checks if instance initialized before deletion.
-    void clear(::mpfr_ptr);
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Exceptions
-class conversion_overflow : public std::exception {
-public:
-    std::string why() { return "inexact conversion from floating point"; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Constructors & converters
-// Default constructor: creates mp number and initializes it to 0.
-inline mpreal::mpreal()
-{
-    mpfr_init2(mpfr_ptr(), mpreal::get_default_prec());
-    mpfr_set_zero_fast(mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpreal& u)
-{
-    mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr()));
-    mpfr_set  (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-inline mpreal::mpreal(mpreal&& other)
-{
-    mpfr_set_uninitialized(mpfr_ptr());     // make sure "other" holds no pointer to actual data
-    mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal& mpreal::operator=(mpreal&& other)
-{
-    mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-#endif
-
-inline mpreal::mpreal(const mpfr_t  u, bool shared)
-{
-    if(shared)
-    {
-        std::memcpy(mpfr_ptr(), u, sizeof(mpfr_t));
-    }
-    else
-    {
-        mpfr_init2(mpfr_ptr(), mpfr_get_prec(u));
-        mpfr_set  (mpfr_ptr(), u, mpreal::get_default_rnd());
-    }
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpf_t u)
-{
-    mpfr_init2(mpfr_ptr(),(mp_prec_t) mpf_get_prec(u)); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t)
-    mpfr_set_f(mpfr_ptr(),u,mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-    mpfr_set_z(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-    mpfr_set_q(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
-{
-     mpfr_init2(mpfr_ptr(), prec);
-
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
-  if(fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW))
-  {
-    mpfr_set_d(mpfr_ptr(), u, mode);
-  }else
-    throw conversion_overflow();
-#else
-  mpfr_set_d(mpfr_ptr(), u, mode);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ld(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_uj(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_sj(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
-    mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s, base, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
-    mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline void mpreal::clear(::mpfr_ptr x)
-{
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-    if(mpfr_is_initialized(x))
-#endif
-    mpfr_clear(x);
-}
-
-inline mpreal::~mpreal()
-{
-    clear(mpfr_ptr());
-}
-
-// internal namespace needed for template magic
-namespace internal{
-
-    // Use SFINAE to restrict arithmetic operations instantiation only for numeric types
-    // This is needed for smooth integration with libraries based on expression templates, like Eigen.
-    // TODO: Do the same for boolean operators.
-    template <typename ArgumentType> struct result_type {};
-
-    template <> struct result_type<mpreal>              {typedef mpreal type;};
-    template <> struct result_type<mpz_t>               {typedef mpreal type;};
-    template <> struct result_type<mpq_t>               {typedef mpreal type;};
-    template <> struct result_type<long double>         {typedef mpreal type;};
-    template <> struct result_type<double>              {typedef mpreal type;};
-    template <> struct result_type<unsigned long int>   {typedef mpreal type;};
-    template <> struct result_type<unsigned int>        {typedef mpreal type;};
-    template <> struct result_type<long int>            {typedef mpreal type;};
-    template <> struct result_type<int>                 {typedef mpreal type;};
-    template <> struct result_type<long long>           {typedef mpreal type;};
-    template <> struct result_type<unsigned long long>  {typedef mpreal type;};
-}
-
-// + Addition
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs;    }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs;    }
-
-// - Subtraction
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs;    }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs;    }
-
-// * Multiplication
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs;    }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs;    }
-
-// / Division
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs;    }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs;    }
-
-//////////////////////////////////////////////////////////////////////////
-// sqrt
-const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-// abs
-inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// pow
-const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// Estimate machine epsilon for the given precision
-// Returns smallest eps such that 1.0 + eps != 1.0
-inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
-
-// Returns smallest eps such that x + eps != x (relative machine epsilon)
-inline mpreal machine_epsilon(const mpreal& x);
-
-// Gives max & min values for the required precision,
-// minval is 'safe' meaning 1 / minval does not overflow
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
-inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec());
-
-// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
-
-// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} )
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
-
-// 'Bitwise' equality check
-//  maxUlps - a and b can be apart by maxUlps binary numbers.
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
-
-//////////////////////////////////////////////////////////////////////////
-// Convert precision in 'bits' to decimal digits and vice versa.
-//    bits   = ceil(digits*log[2](10))
-//    digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d);
-inline int       bits2digits(mp_prec_t b);
-
-//////////////////////////////////////////////////////////////////////////
-// min, max
-const mpreal (max)(const mpreal& x, const mpreal& y);
-const mpreal (min)(const mpreal& x, const mpreal& y);
-
-//////////////////////////////////////////////////////////////////////////
-// Implementation
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-// Operators - Assignment
-inline mpreal& mpreal::operator=(const mpreal& v)
-{
-    if (this != &v)
-    {
-    mp_prec_t tp = mpfr_get_prec(  mpfr_srcptr());
-    mp_prec_t vp = mpfr_get_prec(v.mpfr_srcptr());
-
-    if(tp != vp){
-      clear(mpfr_ptr());
-      mpfr_init2(mpfr_ptr(), vp);
-    }
-
-        mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpf_t v)
-{
-    mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpz_t v)
-{
-    mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpq_t v)
-{
-    mpfr_set_q(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long double v)
-{
-    mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const double v)
-{
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
-  if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
-  {
-    mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-  }else
-    throw conversion_overflow();
-#else
-  mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-#endif
-
-  MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long int v)
-{
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned int v)
-{
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long long int v)
-{
-    mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long long int v)
-{
-    mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long int v)
-{
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const int v)
-{
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const char* s)
-{
-    // Use other converters for more precise control on base & precision & rounding:
-    //
-    //        mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //        mpreal(const std::string& s,mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and we use precision of target variable.
-
-    mpfr_t t;
-
-    mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
-    if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd()))
-    {
-        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-
-    clear(t);
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const std::string& s)
-{
-    // Use other converters for more precise control on base & precision & rounding:
-    //
-    //        mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //        mpreal(const std::string& s,mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and we use precision of target variable.
-
-    mpfr_t t;
-
-    mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
-    if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd()))
-    {
-        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-
-    clear(t);
-    return *this;
-}
-
-template <typename real_t>
-inline mpreal& mpreal::operator= (const std::complex<real_t>& z)
-{
-    return *this = z.real();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// + Addition
-inline mpreal& mpreal::operator+=(const mpreal& v)
-{
-    mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpf_t u)
-{
-    *this += mpreal(u);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpz_t u)
-{
-    mpfr_add_z(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpq_t u)
-{
-    mpfr_add_q(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+= (const long double u)
-{
-    *this += mpreal(u);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+= (const double u)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpfr_add_d(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-#else
-    *this += mpreal(u);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned long int u)
-{
-    mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned int u)
-{
-    mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long int u)
-{
-    mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const int u)
-{
-    mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long long int u)         {    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator+=(const unsigned long long int u){    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator-=(const long long int  u)        {    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator-=(const unsigned long long int u){    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator*=(const long long int  u)        {    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator*=(const unsigned long long int u){    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator/=(const long long int  u)        {    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator/=(const unsigned long long int u){    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-
-inline const mpreal mpreal::operator+()const    {    return mpreal(*this); }
-
-inline const mpreal operator+(const mpreal& a, const mpreal& b)
-{
-  mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
-  mpfr_add(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
-  return c;
-}
-
-inline mpreal& mpreal::operator++()
-{
-    return *this += 1;
-}
-
-inline const mpreal mpreal::operator++ (int)
-{
-    mpreal x(*this);
-    *this += 1;
-    return x;
-}
-
-inline mpreal& mpreal::operator--()
-{
-    return *this -= 1;
-}
-
-inline const mpreal mpreal::operator-- (int)
-{
-    mpreal x(*this);
-    *this -= 1;
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// - Subtraction
-inline mpreal& mpreal::operator-=(const mpreal& v)
-{
-    mpfr_sub(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpz_t v)
-{
-    mpfr_sub_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpq_t v)
-{
-    mpfr_sub_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long double v)
-{
-    *this -= mpreal(v);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
-    *this -= mpreal(v);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned long int v)
-{
-    mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned int v)
-{
-    mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long int v)
-{
-    mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const int v)
-{
-    mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline const mpreal mpreal::operator-()const
-{
-    mpreal u(*this);
-    mpfr_neg(u.mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-    return u;
-}
-
-inline const mpreal operator-(const mpreal& a, const mpreal& b)
-{
-  mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
-  mpfr_sub(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
-  return c;
-}
-
-inline const mpreal operator-(const double  b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_d_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-#else
-    mpreal x(b, mpfr_get_prec(a.mpfr_ptr()));
-    x -= a;
-    return x;
-#endif
-}
-
-inline const mpreal operator-(const unsigned long int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator-(const unsigned int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator-(const long int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator-(const int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// * Multiplication
-inline mpreal& mpreal::operator*= (const mpreal& v)
-{
-    mpfr_mul(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpz_t v)
-{
-    mpfr_mul_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpq_t v)
-{
-    mpfr_mul_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long double v)
-{
-    *this *= mpreal(v);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
-    *this *= mpreal(v);
-#endif
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned long int v)
-{
-    mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned int v)
-{
-    mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long int v)
-{
-    mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const int v)
-{
-    mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline const mpreal operator*(const mpreal& a, const mpreal& b)
-{
-  mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
-  mpfr_mul(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
-  return c;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// / Division
-inline mpreal& mpreal::operator/=(const mpreal& v)
-{
-    mpfr_div(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpz_t v)
-{
-    mpfr_div_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpq_t v)
-{
-    mpfr_div_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long double v)
-{
-    *this /= mpreal(v);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
-    *this /= mpreal(v);
-#endif
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned long int v)
-{
-    mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned int v)
-{
-    mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long int v)
-{
-    mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const int v)
-{
-    mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline const mpreal operator/(const mpreal& a, const mpreal& b)
-{
-  mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
-  mpfr_div(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
-  return c;
-}
-
-inline const mpreal operator/(const unsigned long int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator/(const unsigned int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator/(const long int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator/(const int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator/(const double  b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-#else
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    x /= a;
-    return x;
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Shifts operators - Multiplication/Division by power of 2
-inline mpreal& mpreal::operator<<=(const unsigned long int u)
-{
-    mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const unsigned int u)
-{
-    mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const long int u)
-{
-    mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const int u)
-{
-    mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned long int u)
-{
-    mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned int u)
-{
-    mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const long int u)
-{
-    mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const int u)
-{
-    mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned long int k)
-{
-    return mul_2ui(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned int k)
-{
-    return mul_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator<<(const mpreal& v, const long int k)
-{
-    return mul_2si(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const int k)
-{
-    return mul_2si(v,static_cast<long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned long int k)
-{
-    return div_2ui(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const long int k)
-{
-    return div_2si(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned int k)
-{
-    return div_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const int k)
-{
-    return div_2si(v,static_cast<long int>(k));
-}
-
-// mul_2ui
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_mul_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-// mul_2si
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_mul_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_div_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_div_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-//Relational operators
-
-// WARNING:
-//
-// Please note that following checks for double-NaN are guaranteed to work only in IEEE math mode:
-//
-// isnan(b) =  (b != b)
-// isnan(b) = !(b == b)  (we use in code below)
-//
-// Be cautions if you use compiler options which break strict IEEE compliance (e.g. -ffast-math in GCC).
-// Use std::isnan instead (C++11).
-
-inline bool operator >  (const mpreal& a, const mpreal& b           ){  return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );            }
-inline bool operator >  (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 );                 }
-inline bool operator >  (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 );                 }
-inline bool operator >  (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 );                 }
-inline bool operator >  (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 );                 }
-inline bool operator >  (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 );    }
-inline bool operator >  (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 );    }
-
-inline bool operator >= (const mpreal& a, const mpreal& b           ){  return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );       }
-inline bool operator >= (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 );                }
-// inline bool operator >= (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (isnan()a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 );                }
-inline bool operator >= (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 );                }
-inline bool operator >= (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 );                }
-inline bool operator >= (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 );   }
-inline bool operator >= (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 );   }
-
-inline bool operator <  (const mpreal& a, const mpreal& b           ){  return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );               }
-inline bool operator <  (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 );                 }
-inline bool operator <  (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 );                 }
-inline bool operator <  (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 );                 }
-inline bool operator <  (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 );                 }
-inline bool operator <  (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 );    }
-inline bool operator <  (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 );    }
-
-inline bool operator <= (const mpreal& a, const mpreal& b           ){  return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );          }
-inline bool operator <= (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 );                }
-inline bool operator <= (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 );                }
-inline bool operator <= (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 );                }
-inline bool operator <= (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 );                }
-inline bool operator <= (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 );   }
-inline bool operator <= (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 );   }
-
-inline bool operator == (const mpreal& a, const mpreal& b           ){  return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );              }
-inline bool operator == (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );                }
-inline bool operator == (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );                }
-inline bool operator == (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );                }
-inline bool operator == (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );                }
-inline bool operator == (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 );   }
-inline bool operator == (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 );   }
-
-inline bool operator != (const mpreal& a, const mpreal& b           ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const unsigned long int b ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const unsigned int b      ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const long int b          ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const int b               ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const long double b       ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const double b            ){  return !(a == b);  }
-
-inline bool (isnan)    (const mpreal& op){    return (mpfr_nan_p    (op.mpfr_srcptr()) != 0 );    }
-inline bool (isinf)    (const mpreal& op){    return (mpfr_inf_p    (op.mpfr_srcptr()) != 0 );    }
-inline bool (isfinite) (const mpreal& op){    return (mpfr_number_p (op.mpfr_srcptr()) != 0 );    }
-inline bool iszero   (const mpreal& op){    return (mpfr_zero_p   (op.mpfr_srcptr()) != 0 );    }
-inline bool isint    (const mpreal& op){    return (mpfr_integer_p(op.mpfr_srcptr()) != 0 );    }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline bool isregular(const mpreal& op){    return (mpfr_regular_p(op.mpfr_srcptr()));}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Type Converters
-inline bool               mpreal::toBool   (             )  const    {    return  mpfr_zero_p (mpfr_srcptr()) == 0;     }
-inline long               mpreal::toLong   (mp_rnd_t mode)  const    {    return  mpfr_get_si (mpfr_srcptr(), mode);    }
-inline unsigned long      mpreal::toULong  (mp_rnd_t mode)  const    {    return  mpfr_get_ui (mpfr_srcptr(), mode);    }
-inline float              mpreal::toFloat  (mp_rnd_t mode)  const    {    return  mpfr_get_flt(mpfr_srcptr(), mode);    }
-inline double             mpreal::toDouble (mp_rnd_t mode)  const    {    return  mpfr_get_d  (mpfr_srcptr(), mode);    }
-inline long double        mpreal::toLDouble(mp_rnd_t mode)  const    {    return  mpfr_get_ld (mpfr_srcptr(), mode);    }
-inline long long          mpreal::toLLong  (mp_rnd_t mode)  const    {    return  mpfr_get_sj (mpfr_srcptr(), mode);    }
-inline unsigned long long mpreal::toULLong (mp_rnd_t mode)  const    {    return  mpfr_get_uj (mpfr_srcptr(), mode);    }
-
-inline ::mpfr_ptr     mpreal::mpfr_ptr()             { return mp; }
-inline ::mpfr_srcptr  mpreal::mpfr_ptr()    const    { return mp; }
-inline ::mpfr_srcptr  mpreal::mpfr_srcptr() const    { return mp; }
-
-template <class T>
-inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&))
-{
-    std::ostringstream oss;
-    oss << f << t;
-    return oss.str();
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline std::string mpreal::toString(const std::string& format) const
-{
-    char *s = NULL;
-    std::string out;
-
-    if( !format.empty() )
-    {
-        if(!(mpfr_asprintf(&s, format.c_str(), mpfr_srcptr()) < 0))
-        {
-            out = std::string(s);
-
-            mpfr_free_str(s);
-        }
-    }
-
-    return out;
-}
-
-#endif
-
-inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
-{
-    // TODO: Add extended format specification (f, e, rounding mode) as it done in output operator
-    (void)b;
-    (void)mode;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-    std::ostringstream format;
-
-    int digits = (n >= 0) ? n : 1 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
-
-    format << "%." << digits << "RNg";
-
-    return toString(format.str());
-
-#else
-
-    char *s, *ns = NULL;
-    size_t slen, nslen;
-    mp_exp_t exp;
-    std::string out;
-
-    if(mpfr_inf_p(mp))
-    {
-        if(mpfr_sgn(mp)>0) return "+Inf";
-        else               return "-Inf";
-    }
-
-    if(mpfr_zero_p(mp)) return "0";
-    if(mpfr_nan_p(mp))  return "NaN";
-
-    s  = mpfr_get_str(NULL, &exp, b, 0, mp, mode);
-    ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode);
-
-    if(s!=NULL && ns!=NULL)
-    {
-        slen  = strlen(s);
-        nslen = strlen(ns);
-        if(nslen<=slen)
-        {
-            mpfr_free_str(s);
-            s = ns;
-            slen = nslen;
-        }
-        else {
-            mpfr_free_str(ns);
-        }
-
-        // Make human eye-friendly formatting if possible
-        if (exp>0 && static_cast<size_t>(exp)<slen)
-        {
-            if(s[0]=='-')
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp) ptr--;
-
-                if(ptr==s+exp) out = std::string(s,exp+1);
-                else           out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
-
-                //out = string(s,exp+1)+'.'+string(s+exp+1);
-            }
-            else
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp-1) ptr--;
-
-                if(ptr==s+exp-1) out = std::string(s,exp);
-                else             out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
-
-                //out = string(s,exp)+'.'+string(s+exp);
-            }
-
-        }else{ // exp<0 || exp>slen
-            if(s[0]=='-')
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+1) ptr--;
-
-                if(ptr==s+1) out = std::string(s,2);
-                else         out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
-
-                //out = string(s,2)+'.'+string(s+2);
-            }
-            else
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s) ptr--;
-
-                if(ptr==s) out = std::string(s,1);
-                else       out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
-
-                //out = string(s,1)+'.'+string(s+1);
-            }
-
-            // Make final string
-            if(--exp)
-            {
-                if(exp>0) out += "e+"+mpfr::toString<mp_exp_t>(exp,std::dec);
-                else       out += "e"+mpfr::toString<mp_exp_t>(exp,std::dec);
-            }
-        }
-
-        mpfr_free_str(s);
-        return out;
-    }else{
-        return "conversion error!";
-    }
-#endif
-}
-
-
-//////////////////////////////////////////////////////////////////////////
-// I/O
-inline std::ostream& mpreal::output(std::ostream& os) const
-{
-    std::ostringstream format;
-    const std::ios::fmtflags flags = os.flags();
-
-    format << ((flags & std::ios::showpos) ? "%+" : "%");
-    if (os.precision() >= 0)
-        format << '.' << os.precision() << "R*"
-               << ((flags & std::ios::floatfield) == std::ios::fixed ? 'f' :
-                   (flags & std::ios::floatfield) == std::ios::scientific ? 'e' :
-                   'g');
-    else
-        format << "R*e";
-
-    char *s = NULL;
-    if(!(mpfr_asprintf(&s, format.str().c_str(),
-                        mpfr::mpreal::get_default_rnd(),
-                        mpfr_srcptr())
-        < 0))
-    {
-        os << std::string(s);
-        mpfr_free_str(s);
-    }
-    return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const mpreal& v)
-{
-    return v.output(os);
-}
-
-inline std::istream& operator>>(std::istream &is, mpreal& v)
-{
-    // TODO: use cout::hexfloat and other flags to setup base
-    std::string tmp;
-    is >> tmp;
-    mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd());
-    return is;
-}
-
-//////////////////////////////////////////////////////////////////////////
-//     Bits - decimal digits relation
-//        bits   = ceil(digits*log[2](10))
-//        digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d)
-{
-    const double LOG2_10 = 3.3219280948873624;
-
-    return mp_prec_t(std::ceil( d * LOG2_10 ));
-}
-
-inline int bits2digits(mp_prec_t b)
-{
-    const double LOG10_2 = 0.30102999566398119;
-
-    return int(std::floor( b * LOG10_2 ));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get number properties
-inline int sgn(const mpreal& op)
-{
-    return mpfr_sgn(op.mpfr_srcptr());
-}
-
-inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
-{
-    mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), (sign < 0 ? 1 : 0), RoundingMode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline int mpreal::getPrecision() const
-{
-    return int(mpfr_get_prec(mpfr_srcptr()));
-}
-
-inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
-{
-    mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setInf(int sign)
-{
-    mpfr_set_inf(mpfr_ptr(), sign);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setNan()
-{
-    mpfr_set_nan(mpfr_ptr());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setZero(int sign)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    mpfr_set_zero(mpfr_ptr(), sign);
-#else
-    mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
-    setSign(sign);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mp_prec_t mpreal::get_prec() const
-{
-    return mpfr_get_prec(mpfr_srcptr());
-}
-
-inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode)
-{
-    mpfr_prec_round(mpfr_ptr(),prec,rnd_mode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mp_exp_t mpreal::get_exp ()
-{
-    return mpfr_get_exp(mpfr_srcptr());
-}
-
-inline int mpreal::set_exp (mp_exp_t e)
-{
-    int x = mpfr_set_exp(mpfr_ptr(), e);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return x;
-}
-
-inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
-{
-    mpreal y(x);
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-    mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
-#else
-    *exp = mpfr_get_exp(y.mpfr_srcptr());
-    mpfr_set_exp(y.mpfr_ptr(),0);
-#endif
-    return y;
-}
-
-inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
-{
-    mpreal x(v);
-
-    // rounding is not important since we are just increasing the exponent (= exact operation)
-    mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
-{
-    return ldexp(v, exp);
-}
-
-inline mpreal machine_epsilon(mp_prec_t prec)
-{
-    /* the smallest eps such that 1 + eps != 1 */
-    return machine_epsilon(mpreal(1, prec));
-}
-
-inline mpreal machine_epsilon(const mpreal& x)
-{
-    /* the smallest eps such that x + eps != x */
-    if( x < 0)
-    {
-        return nextabove(-x) + x;
-    }else{
-        return nextabove( x) - x;
-    }
-}
-
-// minval is 'safe' meaning 1 / minval does not overflow
-inline mpreal minval(mp_prec_t prec)
-{
-    /* min = 1/2 * 2^emin = 2^(emin - 1) */
-    return mpreal(1, prec) << mpreal::get_emin()-1;
-}
-
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal maxval(mp_prec_t prec)
-{
-    /* max = (1 - eps) * 2^emax, eps is machine epsilon */
-    return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax();
-}
-
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
-{
-    return abs(a - b) <= machine_epsilon((max)(abs(a), abs(b))) * maxUlps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps)
-{
-    return abs(a - b) <= eps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
-{
-    return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b)))));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// C++11 sign functions.
-inline mpreal copysign(const mpreal& x, const  mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal rop(0, mpfr_get_prec(x.mpfr_ptr()));
-    mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode);
-    return rop;
-}
-
-inline bool signbit(const mpreal& x)
-{
-    return mpfr_signbit(x.mpfr_srcptr());
-}
-
-inline const mpreal modf(const mpreal& v, mpreal& n)
-{
-    mpreal f(v);
-
-    // rounding is not important since we are using the same number
-    mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());
-    mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr());
-    return f;
-}
-
-inline int mpreal::check_range (int t, mp_rnd_t rnd_mode)
-{
-    return mpfr_check_range(mpfr_ptr(),t,rnd_mode);
-}
-
-inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode)
-{
-    int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return r;
-}
-
-inline mp_exp_t mpreal::get_emin (void)
-{
-    return mpfr_get_emin();
-}
-
-inline int mpreal::set_emin (mp_exp_t exp)
-{
-    return mpfr_set_emin(exp);
-}
-
-inline mp_exp_t mpreal::get_emax (void)
-{
-    return mpfr_get_emax();
-}
-
-inline int mpreal::set_emax (mp_exp_t exp)
-{
-    return mpfr_set_emax(exp);
-}
-
-inline mp_exp_t mpreal::get_emin_min (void)
-{
-    return mpfr_get_emin_min();
-}
-
-inline mp_exp_t mpreal::get_emin_max (void)
-{
-    return mpfr_get_emin_max();
-}
-
-inline mp_exp_t mpreal::get_emax_min (void)
-{
-    return mpfr_get_emax_min();
-}
-
-inline mp_exp_t mpreal::get_emax_max (void)
-{
-    return mpfr_get_emax_max();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Mathematical Functions
-//////////////////////////////////////////////////////////////////////////
-#define MPREAL_UNARY_MATH_FUNCTION_BODY(f)                    \
-        mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));          \
-        mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r);           \
-        return y;
-
-inline const mpreal sqr  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqr );    }
-
-inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt);    }
-
-inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r)
-{
-    mpreal y;
-    mpfr_sqrt_ui(y.mpfr_ptr(), x, r);
-    return y;
-}
-
-inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
-{
-    return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-}
-
-inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
-{
-    if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN
-}
-
-inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
-{
-    if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN
-}
-
-inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));
-    mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
-    return y;
-}
-
-inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_dim(y.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r);
-    return y;
-}
-
-inline int cmpabs(const mpreal& a,const mpreal& b)
-{
-    return mpfr_cmpabs(a.mpfr_ptr(), b.mpfr_srcptr());
-}
-
-inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode);
-}
-
-inline const mpreal sqrt  (const long double v, mp_rnd_t rnd_mode)    {   return sqrt(mpreal(v),rnd_mode);    }
-inline const mpreal sqrt  (const double v, mp_rnd_t rnd_mode)         {   return sqrt(mpreal(v),rnd_mode);    }
-
-inline const mpreal cbrt  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt );    }
-inline const mpreal fabs  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
-inline const mpreal abs   (const mpreal& x, mp_rnd_t r)                             {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
-inline const mpreal log   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log  );    }
-inline const mpreal log2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log2 );    }
-inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log10);    }
-inline const mpreal exp   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp  );    }
-inline const mpreal exp2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 );    }
-inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp10);    }
-inline const mpreal cos   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cos  );    }
-inline const mpreal sin   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sin  );    }
-inline const mpreal tan   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tan  );    }
-inline const mpreal sec   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sec  );    }
-inline const mpreal csc   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csc  );    }
-inline const mpreal cot   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cot  );    }
-inline const mpreal acos  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acos );    }
-inline const mpreal asin  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asin );    }
-inline const mpreal atan  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atan );    }
-
-inline const mpreal logb  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   return log2 (abs(x),r);                    }
-
-inline const mpreal acot  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atan (1/v, r);                      }
-inline const mpreal asec  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acos (1/v, r);                      }
-inline const mpreal acsc  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asin (1/v, r);                      }
-inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atanh(1/v, r);                      }
-inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acosh(1/v, r);                      }
-inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asinh(1/v, r);                      }
-
-inline const mpreal cosh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cosh );    }
-inline const mpreal sinh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sinh );    }
-inline const mpreal tanh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tanh );    }
-inline const mpreal sech  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sech );    }
-inline const mpreal csch  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csch );    }
-inline const mpreal coth  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(coth );    }
-inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acosh);    }
-inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asinh);    }
-inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atanh);    }
-
-inline const mpreal log1p   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log1p  );    }
-inline const mpreal expm1   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(expm1  );    }
-inline const mpreal eint    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(eint   );    }
-inline const mpreal gamma   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
-inline const mpreal tgamma  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
-inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma);    }
-inline const mpreal zeta    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(zeta   );    }
-inline const mpreal erf     (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erf    );    }
-inline const mpreal erfc    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erfc   );    }
-inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j0     );    }
-inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j1     );    }
-inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y0     );    }
-inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y1     );    }
-
-inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_atan2(a.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_hypot(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_remquo(a.mpfr_ptr(),q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec     = mpreal::get_default_prec(),
-                                           mp_rnd_t  rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(0, prec);
-    mpfr_fac_ui(x.mpfr_ptr(),v,rnd_mode);
-    return x;
-}
-
-
-inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(v);
-    int tsignp;
-
-    if(signp)   mpfr_lgamma(x.mpfr_ptr(),  signp,v.mpfr_srcptr(),rnd_mode);
-    else        mpfr_lgamma(x.mpfr_ptr(),&tsignp,v.mpfr_srcptr(),rnd_mode);
-
-    return x;
-}
-
-
-inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal  y(0, x.getPrecision());
-    mpfr_jn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
-    return y;
-}
-
-inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal  y(0, x.getPrecision());
-    mpfr_yn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
-    return y;
-}
-
-inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2, p3;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-    p3 = v3.get_prec();
-
-    a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
-    mpfr_fma(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2, p3;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-    p3 = v3.get_prec();
-
-    a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
-    mpfr_fms(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-
-    a.set_prec(p1>p2?p1:p2);
-
-    mpfr_agm(a.mp, v1.mp, v2.mp, rnd_mode);
-
-    return a;
-}
-
-inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
-{
-    mpfr_srcptr *p = new mpfr_srcptr[n];
-
-    for (unsigned long int  i = 0; i < n; i++)
-        p[i] = tab[i].mpfr_srcptr();
-
-    mpreal x;
-    status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode);
-
-    delete [] p;
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
-}
-
-inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
-}
-
-inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    /*  R = rem(X,Y) if Y != 0, returns X - n * Y where n = trunc(X/Y). */
-    return fmod(x, y, rnd_mode);
-}
-
-inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    (void)rnd_mode;
-
-    /*
-
-    m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
-
-    The following are true by convention:
-    - mod(x,0) is x
-    - mod(x,x) is 0
-    - mod(x,y) for x != y and y != 0 has the same sign as y.
-
-    */
-
-    if(iszero(y)) return x;
-    if(x == y) return 0;
-
-    mpreal m = x - floor(x / y) * y;
-
-    m.setSign(sgn(y)); // make sure result has the same sign as Y
-
-    return m;
-}
-
-inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t yp, xp;
-
-    yp = y.get_prec();
-    xp = x.get_prec();
-
-    a.set_prec(yp>xp?yp:xp);
-
-    mpfr_fmod(a.mp, x.mp, y.mp, rnd_mode);
-
-    return a;
-}
-
-inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(v);
-    mpfr_rec_sqrt(x.mp,v.mp,rnd_mode);
-    return x;
-}
-#endif //  MPFR 2.4.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 3.0.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(digamma);     }
-inline const mpreal ai      (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(ai);          }
-#endif // MPFR 3.0.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// Constants
-inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_log2(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_pi(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_euler(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_catalan(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_infinity (int sign = 1, mp_prec_t p = mpreal::get_default_prec())
-{
-    mpreal x(0, p);
-    mpfr_set_inf(x.mpfr_ptr(), sign);
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Integer Related Functions
-inline const mpreal ceil(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_ceil(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal floor(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_floor(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal round(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_round(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal trunc(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_trunc(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal rint       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint      );     }
-inline const mpreal rint_ceil  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil );     }
-inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor);     }
-inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round);     }
-inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc);     }
-inline const mpreal frac       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(frac      );     }
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline void         swap (mpreal& a, mpreal& b)            {    mpfr_swap(a.mp,b.mp);   }
-inline const mpreal (max)(const mpreal& x, const mpreal& y){    return (x>y?x:y);       }
-inline const mpreal (min)(const mpreal& x, const mpreal& y){    return (x<y?x:y);       }
-
-inline const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mpfr_max(a.mp,x.mp,y.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal fmin(const mpreal& x, const mpreal& y,  mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mpfr_min(a.mp,x.mp,y.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal nexttoward (const mpreal& x, const mpreal& y)
-{
-    mpreal a(x);
-    mpfr_nexttoward(a.mp,y.mp);
-    return a;
-}
-
-inline const mpreal nextabove  (const mpreal& x)
-{
-    mpreal a(x);
-    mpfr_nextabove(a.mp);
-    return a;
-}
-
-inline const mpreal nextbelow  (const mpreal& x)
-{
-    mpreal a(x);
-    mpfr_nextbelow(a.mp);
-    return a;
-}
-
-inline const mpreal urandomb (gmp_randstate_t& state)
-{
-    mpreal x;
-    mpfr_urandomb(x.mpfr_ptr(),state);
-    return x;
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
-    return x;
-}
-#endif
-
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
-{
-    mpreal x;
-    mpfr_random2(x.mpfr_ptr(),size,exp);
-    return x;
-}
-#endif
-
-// Uniformly distributed random number generation
-// a = random(seed); <- initialization & first random number generation
-// a = random();     <- next random numbers generation
-// seed != 0
-inline const mpreal random(unsigned int seed = 0)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    static gmp_randstate_t state;
-    static bool initialize = true;
-
-    if(initialize)
-    {
-        gmp_randinit_default(state);
-        gmp_randseed_ui(state,0);
-        initialize = false;
-    }
-
-    if(seed != 0)    gmp_randseed_ui(state,seed);
-
-    return mpfr::urandom(state);
-#else
-    if(seed != 0)    std::srand(seed);
-    return mpfr::mpreal(std::rand()/(double)RAND_MAX);
-#endif
-
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-
-inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
-    return x;
-}
-
-inline const mpreal grandom(unsigned int seed = 0)
-{
-    static gmp_randstate_t state;
-    static bool initialize = true;
-
-    if(initialize)
-    {
-        gmp_randinit_default(state);
-        gmp_randseed_ui(state,0);
-        initialize = false;
-    }
-
-    if(seed != 0) gmp_randseed_ui(state,seed);
-
-    return mpfr::grandom(state);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get global properties
-inline void mpreal::set_default_prec(mp_prec_t prec)
-{
-    mpfr_set_default_prec(prec);
-}
-
-inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
-{
-    mpfr_set_default_rounding_mode(rnd_mode);
-}
-
-inline bool mpreal::fits_in_bits(double x, int n)
-{
-    int i;
-    double t;
-    return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0);
-}
-
-inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow(x.mp,x.mp,b.mp,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_z(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_ui(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<unsigned long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_si(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_ui_pow(x.mp,a,b.mp,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)     return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-    else          return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)     return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-    else          return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode);
-}
-
-// pow unsigned long int
-inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    mpreal x(a);
-    mpfr_ui_pow_ui(x.mp,a,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if(b>0)    return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else       return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode)
-{
-    if(b>0)    return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else       return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow unsigned int
-inline const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow long int
-inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);  //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow int
-inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);  //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow long double
-inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-} // End of mpfr namespace
-
-// Explicit specialization of std::swap for mpreal numbers
-// Thus standard algorithms will use efficient version of swap (due to Koenig lookup)
-// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap
-namespace std
-{
-  // we are allowed to extend namespace std with specializations only
-    template <>
-    inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
-    {
-        return mpfr::swap(x, y);
-    }
-
-    template<>
-    class numeric_limits<mpfr::mpreal>
-    {
-    public:
-        static const bool is_specialized    = true;
-        static const bool is_signed         = true;
-        static const bool is_integer        = false;
-        static const bool is_exact          = false;
-        static const int  radix             = 2;
-
-        static const bool has_infinity      = true;
-        static const bool has_quiet_NaN     = true;
-        static const bool has_signaling_NaN = true;
-
-        static const bool is_iec559         = true;        // = IEEE 754
-        static const bool is_bounded        = true;
-        static const bool is_modulo         = false;
-        static const bool traps             = true;
-        static const bool tinyness_before   = true;
-
-        static const float_denorm_style has_denorm  = denorm_absent;
-
-        inline static mpfr::mpreal (min)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::minval(precision);  }
-        inline static mpfr::mpreal (max)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::maxval(precision);  }
-        inline static mpfr::mpreal lowest   (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return -mpfr::maxval(precision);  }
-
-        // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon)
-        inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::machine_epsilon(precision); }
-
-        // Returns smallest eps such that x + eps != x (relative machine epsilon)
-        inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) {  return mpfr::machine_epsilon(x);  }
-
-        inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
-            if(r == GMP_RNDN)  return mpfr::mpreal(0.5, precision);
-            else               return mpfr::mpreal(1.0, precision);
-        }
-
-        inline static const mpfr::mpreal infinity()         { return mpfr::const_infinity();     }
-        inline static const mpfr::mpreal quiet_NaN()        { return mpfr::mpreal().setNan();    }
-        inline static const mpfr::mpreal signaling_NaN()    { return mpfr::mpreal().setNan();    }
-        inline static const mpfr::mpreal denorm_min()       { return (min)();                    }
-
-        // Please note, exponent range is not fixed in MPFR
-        static const int min_exponent = MPFR_EMIN_DEFAULT;
-        static const int max_exponent = MPFR_EMAX_DEFAULT;
-        MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
-        MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
-
-#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
-
-        // Following members should be constant according to standard, but they can be variable in MPFR
-        // So we define them as functions here.
-        //
-        // This is preferable way for std::numeric_limits<mpfr::mpreal> specialization.
-        // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost.
-        // See below for compatible implementation.
-        inline static float_round_style round_style()
-        {
-            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
-            switch (r)
-            {
-            case GMP_RNDN: return round_to_nearest;
-            case GMP_RNDZ: return round_toward_zero;
-            case GMP_RNDU: return round_toward_infinity;
-            case GMP_RNDD: return round_toward_neg_infinity;
-            default: return round_indeterminate;
-            }
-        }
-
-        inline static int digits()                        {    return int(mpfr::mpreal::get_default_prec());    }
-        inline static int digits(const mpfr::mpreal& x)   {    return x.getPrecision();                         }
-
-        inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            return mpfr::bits2digits(precision);
-        }
-
-        inline static int digits10(const mpfr::mpreal& x)
-        {
-            return mpfr::bits2digits(x.getPrecision());
-        }
-
-        inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            return digits10(precision);
-        }
-#else
-        // Digits and round_style are NOT constants when it comes to mpreal.
-        // If possible, please use functions digits() and round_style() defined above.
-        //
-        // These (default) values are preserved for compatibility with existing libraries, e.g. boost.
-        // Change them accordingly to your application.
-        //
-        // For example, if you use 256 bits of precision uniformly in your program, then:
-        // digits       = 256
-        // digits10     = 77
-        // max_digits10 = 78
-        //
-        // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
-
-        static const std::float_round_style round_style = round_to_nearest;
-        static const int digits       = 53;
-        static const int digits10     = 15;
-        static const int max_digits10 = 16;
-#endif
-    };
-
-}
-
-#endif /* __MPREAL_H__ */
diff --git a/contrib/eigen/unsupported/test/mpreal_support.cpp b/contrib/eigen/unsupported/test/mpreal_support.cpp
index 685e7ea45e3457deac7d4597ddfc1c483d64a193..10beb0714a4d826ffc66e74f2d1701053bd7b21c 100644
--- a/contrib/eigen/unsupported/test/mpreal_support.cpp
+++ b/contrib/eigen/unsupported/test/mpreal_support.cpp
@@ -1,3 +1,4 @@
+#include <mpreal.h>  // Must be included before main.h.
 #include "main.h"
 #include <Eigen/MPRealSupport>
 #include <Eigen/LU>
@@ -7,7 +8,7 @@
 using namespace mpfr;
 using namespace Eigen;
 
-void test_mpreal_support()
+EIGEN_DECLARE_TEST(mpreal_support)
 {
   // set precision to 256 bits (double has only 53 bits)
   mpreal::set_default_prec(256);
diff --git a/contrib/eigen/unsupported/test/openglsupport.cpp b/contrib/eigen/unsupported/test/openglsupport.cpp
index 5f6343427eb826f255e1ada9f6cf42c33d330699..1c44381349d1155426f77c2f680c7732f4e3e637 100644
--- a/contrib/eigen/unsupported/test/openglsupport.cpp
+++ b/contrib/eigen/unsupported/test/openglsupport.cpp
@@ -9,15 +9,24 @@
 
 #include <main.h>
 #include <iostream>
+#include <string>
+
+#if defined(__APPLE_CC__)
+  // Prevent deprecation warnings caused by GLEW on MacOS.
+  #define GL_SILENCE_DEPRECATION 1
+#endif
 #include <GL/glew.h>
 #include <Eigen/OpenGLSupport>
-#include <GL/glut.h>
-using namespace Eigen;
-
-
+#if defined(__APPLE_CC__)
+  #include <GLUT/glut.h>
+#else
+  #include <GL/freeglut.h>
+#endif
 
+using namespace Eigen;
 
 #define VERIFY_MATRIX(CODE,REF) { \
+    glMatrixMode(GL_MODELVIEW); \
     glLoadIdentity(); \
     CODE; \
     Matrix<float,4,4,ColMajor> m; m.setZero(); \
@@ -40,7 +49,7 @@ using namespace Eigen;
     } \
     VERIFY_IS_APPROX(value, data); \
   }
-  
+
 #define VERIFY_UNIFORMi(NAME,TYPE) { \
     TYPE value = TYPE::Random().eval().cast<float>().cast<TYPE::Scalar>(); \
     TYPE data; \
@@ -53,175 +62,324 @@ using namespace Eigen;
     } \
     VERIFY_IS_APPROX(value, data); \
   }
-  
-void printInfoLog(GLuint objectID)
+
+void printProgramInfoLog(GLuint objectID)
 {
     int infologLength, charsWritten;
     GLchar *infoLog;
-    glGetProgramiv(objectID,GL_INFO_LOG_LENGTH, &infologLength);
+    glGetProgramiv(objectID, GL_INFO_LOG_LENGTH, &infologLength);
     if(infologLength > 0)
     {
         infoLog = new GLchar[infologLength];
         glGetProgramInfoLog(objectID, infologLength, &charsWritten, infoLog);
-        if (charsWritten>0)
+        if (charsWritten > 0)
+          std::cerr << "Program info : \n" << infoLog << std::endl;
+        delete[] infoLog;
+    }
+}
+
+void printShaderInfoLog(GLuint objectID)
+{
+    int infologLength, charsWritten;
+    GLchar *infoLog;
+    glGetShaderiv(objectID, GL_INFO_LOG_LENGTH, &infologLength);
+    if(infologLength > 0)
+    {
+        infoLog = new GLchar[infologLength];
+        glGetShaderInfoLog(objectID, infologLength, &charsWritten, infoLog);
+        if (charsWritten > 0)
           std::cerr << "Shader info : \n" << infoLog << std::endl;
         delete[] infoLog;
     }
 }
 
-GLint createShader(const char* vtx, const char* frg)
+GLint createProgram(const char* vtx, const char* frg, bool print_errors = true)
 {
   GLint prg_id = glCreateProgram();
   GLint vtx_id = glCreateShader(GL_VERTEX_SHADER);
   GLint frg_id = glCreateShader(GL_FRAGMENT_SHADER);
   GLint ok;
-  
+
   glShaderSource(vtx_id, 1, &vtx, 0);
   glCompileShader(vtx_id);
-  glGetShaderiv(vtx_id,GL_COMPILE_STATUS,&ok);
+  glGetShaderiv(vtx_id, GL_COMPILE_STATUS, &ok);
   if(!ok)
   {
-    std::cerr << "vtx compilation failed\n";
+    if (print_errors)
+    {
+      std::cerr << "vtx compilation failed\n";
+      std::cerr << "Source:\n" << vtx << "\n";
+      printShaderInfoLog(vtx_id);
+    }
+    glDeleteShader(vtx_id);
+    return GL_ZERO;
   }
-  
+
   glShaderSource(frg_id, 1, &frg, 0);
   glCompileShader(frg_id);
-  glGetShaderiv(frg_id,GL_COMPILE_STATUS,&ok);
+  glGetShaderiv(frg_id, GL_COMPILE_STATUS, &ok);
   if(!ok)
   {
-    std::cerr << "frg compilation failed\n";
+    if (print_errors)
+    {
+      std::cerr << "frg compilation failed.\n";
+      std::cerr << "Source:\n" << frg << "\n";
+      printShaderInfoLog(frg_id);
+    }
+    glDeleteShader(vtx_id);
+    glDeleteShader(frg_id);
+    return GL_ZERO;
   }
-  
+
   glAttachShader(prg_id, vtx_id);
   glAttachShader(prg_id, frg_id);
   glLinkProgram(prg_id);
-  glGetProgramiv(prg_id,GL_LINK_STATUS,&ok);
+
+  // Delete shaders once linked.
+  glDeleteShader(vtx_id);
+  glDeleteShader(frg_id);
+  glGetProgramiv(prg_id, GL_LINK_STATUS, &ok);
   if(!ok)
   {
-    std::cerr << "linking failed\n";
+    if (print_errors)
+    {
+      std::cerr << "linking failed.\n";
+      printProgramInfoLog(prg_id);
+    }
+    glDeleteProgram(prg_id);
+    return GL_ZERO;
   }
-  printInfoLog(prg_id);
-  
+
   glUseProgram(prg_id);
   return prg_id;
 }
 
-void test_openglsupport()
+GLint createProgram(const std::string& vtx, const std::string& frg, bool print_errors = true)
 {
-  int argc = 0;
-  glutInit(&argc, 0);
-  glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
-  glutInitWindowPosition (0,0);
-  glutInitWindowSize(10, 10);
+  return createProgram(vtx.c_str(), frg.c_str(), print_errors);
+}
 
-  if(glutCreateWindow("Eigen") <= 0)
+std::string getGlslVersionString(int gl_major_version, int gl_minor_version)
+{
+  switch (gl_major_version)
   {
-    std::cerr << "Error: Unable to create GLUT Window.\n";
-    exit(1);
+    case 2:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 110";
+        case 1:
+          return "#version 120";
+      }
+      break;
+    case 3:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 130";
+        case 1:
+          return "#version 140";
+        case 2:
+          return "#version 150";
+        case 3:
+          return "#version 330";
+      }
+      break;
+    case 4:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 400";
+        case 1:
+          return "#version 410";
+        case 2:
+          return "#version 420";
+        case 3:
+          return "#version 430";
+        case 4:
+          return "#version 440";
+        case 5:
+          return "#version 450";
+        case 6:
+          return "#version 460";
+      }
+      break;
   }
-  
-  glewExperimental = GL_TRUE;
-  if(glewInit() != GLEW_OK)
-  {
-    std::cerr << "Warning: Failed to initialize GLEW\n";
+  return "";
+}
+
+void find_and_replace(
+  std::string& str,
+  const std::string& find,
+  const std::string& replace)
+{
+  size_t loc = 0;
+  size_t flen = find.length();
+  size_t rlen = replace.length();
+  while ( (loc = str.find(find, loc)) != std::string::npos) {
+    str.replace(loc, flen, replace);
+    loc += rlen;
   }
+}
 
-  Vector3f v3f;
-  Matrix3f rot;
-  glBegin(GL_POINTS);
-  
-  glVertex(v3f);
-  glVertex(2*v3f+v3f);
-  glVertex(rot*v3f);
-  
-  glEnd();
-  
-  // 4x4 matrices
-  Matrix4f mf44; mf44.setRandom();
-  VERIFY_MATRIX(glLoadMatrix(mf44), mf44);
-  VERIFY_MATRIX(glMultMatrix(mf44), mf44);
-  Matrix4d md44; md44.setRandom();
-  VERIFY_MATRIX(glLoadMatrix(md44), md44);
-  VERIFY_MATRIX(glMultMatrix(md44), md44);
-  
-  // Quaternion
-  Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random()));
-  VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix());
-  
-  Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random()));
-  VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix());
-  
-  // 3D Transform
-  Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom();
-  VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix());
-  VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix());
-  
-  Transform<float,3,Affine> af3(acf3);
-  VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix());
-  VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix());
-  
-  Transform<float,3,Projective> pf3; pf3.matrix().setRandom();
-  VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix());
-  VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix());
-  
-  Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom();
-  VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix());
-  VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix());
-  
-  Transform<double,3,Affine> ad3(acd3);
-  VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix());
-  VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix());
-  
-  Transform<double,3,Projective> pd3; pd3.matrix().setRandom();
-  VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix());
-  VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix());
-  
-  // translations (2D and 3D)
-  {
-    Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0;
-    VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix());
-    Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0;
-    VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix());
-    
-    Vector3f vf3; vf3.setRandom();
-    VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix());
-    Vector3d vd3; vd3.setRandom();
-    VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix());
-    
-    Translation<float,3> tf3; tf3.vector().setRandom();
-    VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix());
-    
-    Translation<double,3> td3;  td3.vector().setRandom();
-    VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix());
+// Finds and replaces a set of substrings in a string.
+std::string format(
+  const std::string& str,
+  const std::vector<std::string>& find,
+  const std::vector<std::string>& replace)
+{
+  std::string out = str;
+  for (std::size_t i=0; i<find.size(); ++i) {
+    find_and_replace(out, find[i], replace[i]);
   }
-  
-  // scaling (2D and 3D)
+  return out;
+}
+
+// GLUT display function that runs test.  Must be run within the display loop
+// in order to properly destroy resources.
+void openglsupport_test_loop()
+{
+  // Get context info.
+  const GLubyte* gl_version_string = glGetString(GL_VERSION);
+  std::cerr << "GL version: " << gl_version_string << std::endl;
+  std::cerr << "GLSL version: " << glGetString(GL_SHADING_LANGUAGE_VERSION) << std::endl;
+  // Parse version from string since GL_MAJOR_VERSION is only supported in GL 3.0+.
+  // Version string guaranteed to be <major>.<minor><vender extension>.
+  GLint gl_major_version = gl_version_string[0] - '0';
+  GLint gl_minor_version = gl_version_string[2] - '0';
+  bool legacy_gl = gl_major_version < 3 || (gl_major_version == 3 && gl_minor_version < 2);
+
+  // Fixed-function pipeline removed in OpenGL 3.2.
+  if (legacy_gl)
   {
-    Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1;
-    VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix());
-    Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1;
-    VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix());
-    
-    Vector3f vf3; vf3.setRandom();
-    VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix());
-    Vector3d vd3; vd3.setRandom();
-    VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix());
-    
-    UniformScaling<float> usf(internal::random<float>());
-    VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix());
-    
-    UniformScaling<double> usd(internal::random<double>());
-    VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix());
+    // Draw a basic triangle.
+    Vector3f v3f;
+    Matrix3f rot;
+    glBegin(GL_POINTS);
+    {
+      glVertex(v3f);
+      glVertex(2*v3f+v3f);
+      glVertex(rot*v3f);
+    }
+    glEnd();
+
+    // 4x4 matrices
+    Matrix4f mf44; mf44.setRandom();
+    VERIFY_MATRIX(glLoadMatrix(mf44), mf44);
+    VERIFY_MATRIX(glMultMatrix(mf44), mf44);
+    Matrix4d md44; md44.setRandom();
+    VERIFY_MATRIX(glLoadMatrix(md44), md44);
+    VERIFY_MATRIX(glMultMatrix(md44), md44);
+
+    // Quaternion
+    Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random()));
+    VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix());
+
+    Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random()));
+    VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix());
+
+    // 3D Transform
+    Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix());
+    VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix());
+
+    Transform<float,3,Affine> af3(acf3);
+    VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix());
+    VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix());
+
+    Transform<float,3,Projective> pf3; pf3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix());
+    VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix());
+
+    Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix());
+    VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix());
+
+    Transform<double,3,Affine> ad3(acd3);
+    VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix());
+    VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix());
+
+    Transform<double,3,Projective> pd3; pd3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix());
+    VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix());
+
+    // translations (2D and 3D)
+    {
+      Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0;
+      VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix());
+      Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0;
+      VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix());
+
+      Vector3f vf3; vf3.setRandom();
+      VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix());
+      Vector3d vd3; vd3.setRandom();
+      VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix());
+
+      Translation<float,3> tf3; tf3.vector().setRandom();
+      VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix());
+
+      Translation<double,3> td3;  td3.vector().setRandom();
+      VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix());
+    }
+
+    // scaling (2D and 3D)
+    {
+      Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1;
+      VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix());
+      Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1;
+      VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix());
+
+      Vector3f vf3; vf3.setRandom();
+      VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix());
+      Vector3d vd3; vd3.setRandom();
+      VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix());
+
+      UniformScaling<float> usf(internal::random<float>());
+      VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix());
+
+      UniformScaling<double> usd(internal::random<double>());
+      VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix());
+    }
+  } else {
+    std::cerr << "Warning: fixed-function pipeline was not tested.\n";
+  }
+
+  // Dynamic shader substitution variables.
+  // Modern shaders require a version string, and newer runtimes fail to
+  // compile old GLSL versions. Thus, we dynamically set the GLSL version
+  // string based on runtime. Also, pre OpenGL 3.0, the output gl_FragColor was
+  // built-in. This was deprecated in OpenGL 3.0, requiring us to explicitly
+  // define the output variable.
+  std::vector<std::string> glsl_vars;
+  glsl_vars.push_back("${GLSL_VERSION}");
+  glsl_vars.push_back("${FRAG_OUTPUT_DECLARATION}");
+  glsl_vars.push_back("${FRAG_OUTPUT_VARIABLE}");
+
+  std::vector<std::string> glsl_vals;
+  glsl_vals.push_back(getGlslVersionString(gl_major_version, gl_minor_version));
+  if (gl_major_version >= 3) {
+    glsl_vals.push_back("out vec4 fragColor;");
+    glsl_vals.push_back("fragColor");
+  } else {
+    glsl_vals.push_back("");
+    glsl_vals.push_back("gl_FragColor");
   }
-  
+
   // uniform
   {
-    const char* vtx = "void main(void) { gl_Position = gl_Vertex; }\n";
-    
-    if(GLEW_VERSION_2_0)
+    // vertex shader.
+    std::string vtx = format(
+      "${GLSL_VERSION}\n"
+      "void main(void) {\n"
+      "  gl_Position = vec4(0,0,0,1);\n"
+      "}\n",
+      glsl_vars, glsl_vals);
+
+#ifdef GL_VERSION_2_0
+    if(GLEW_VERSION_2_0 && GL_VERSION_2_0)
     {
-      #ifdef GL_VERSION_2_0
-      const char* frg = ""
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
         "uniform vec2 v2f;\n"
         "uniform vec3 v3f;\n"
         "uniform vec4 v4f;\n"
@@ -231,103 +389,212 @@ void test_openglsupport()
         "uniform mat2 m2f;\n"
         "uniform mat3 m3f;\n"
         "uniform mat4 m4f;\n"
-        "void main(void) { gl_FragColor = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]); }\n";
-        
-      GLint prg_id = createShader(vtx,frg);
-      
-      VERIFY_UNIFORM(fv,v2f, Vector2f);
-      VERIFY_UNIFORM(fv,v3f, Vector3f);
-      VERIFY_UNIFORM(fv,v4f, Vector4f);
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) { \n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
+      VERIFY_UNIFORM(fv, v2f, Vector2f);
+      VERIFY_UNIFORM(fv, v3f, Vector3f);
+      VERIFY_UNIFORM(fv, v4f, Vector4f);
       VERIFY_UNIFORMi(v2i, Vector2i);
       VERIFY_UNIFORMi(v3i, Vector3i);
       VERIFY_UNIFORMi(v4i, Vector4i);
-      VERIFY_UNIFORM(fv,m2f, Matrix2f);
-      VERIFY_UNIFORM(fv,m3f, Matrix3f);
-      VERIFY_UNIFORM(fv,m4f, Matrix4f);
-      #endif
+      VERIFY_UNIFORM(fv, m2f, Matrix2f);
+      VERIFY_UNIFORM(fv, m3f, Matrix3f);
+      VERIFY_UNIFORM(fv, m4f, Matrix4f);
+      glDeleteProgram(prg_id);
     }
     else
-      std::cerr << "Warning: opengl 2.0 was not tested\n";
-    
-    if(GLEW_VERSION_2_1)
+#endif
+      std::cerr << "Warning: opengl 2.0 was not tested.\n";
+
+#ifdef GL_VERSION_2_1
+    if(GLEW_VERSION_2_1 && GL_VERSION_2_1 &&
+        (gl_major_version > 2 || (gl_major_version == 2 && gl_minor_version >= 1)))
     {
-      #ifdef GL_VERSION_2_1
-      const char* frg = "#version 120\n"
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
         "uniform mat2x3 m23f;\n"
         "uniform mat3x2 m32f;\n"
         "uniform mat2x4 m24f;\n"
         "uniform mat4x2 m42f;\n"
         "uniform mat3x4 m34f;\n"
         "uniform mat4x3 m43f;\n"
-        "void main(void) { gl_FragColor = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]); }\n";
-        
-      GLint prg_id = createShader(vtx,frg);
-      
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
       typedef Matrix<float,2,3> Matrix23f;
       typedef Matrix<float,3,2> Matrix32f;
       typedef Matrix<float,2,4> Matrix24f;
       typedef Matrix<float,4,2> Matrix42f;
       typedef Matrix<float,3,4> Matrix34f;
       typedef Matrix<float,4,3> Matrix43f;
-      
-      VERIFY_UNIFORM(fv,m23f, Matrix23f);
-      VERIFY_UNIFORM(fv,m32f, Matrix32f);
-      VERIFY_UNIFORM(fv,m24f, Matrix24f);
-      VERIFY_UNIFORM(fv,m42f, Matrix42f);
-      VERIFY_UNIFORM(fv,m34f, Matrix34f);
-      VERIFY_UNIFORM(fv,m43f, Matrix43f);
-      #endif
+
+      VERIFY_UNIFORM(fv, m23f, Matrix23f);
+      VERIFY_UNIFORM(fv, m32f, Matrix32f);
+      VERIFY_UNIFORM(fv, m24f, Matrix24f);
+      VERIFY_UNIFORM(fv, m42f, Matrix42f);
+      VERIFY_UNIFORM(fv, m34f, Matrix34f);
+      VERIFY_UNIFORM(fv, m43f, Matrix43f);
+      glDeleteProgram(prg_id);
     }
     else
-      std::cerr << "Warning: opengl 2.1 was not tested\n";
-    
-    if(GLEW_VERSION_3_0)
+#endif
+      std::cerr << "Warning: opengl 2.1 was not tested.\n";
+
+#ifdef GL_VERSION_3_0
+    if(GLEW_VERSION_3_0 && GL_VERSION_3_0 && gl_major_version >= 3)
     {
-      #ifdef GL_VERSION_3_0
-      const char* frg = "#version 150\n"
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
         "uniform uvec2 v2ui;\n"
         "uniform uvec3 v3ui;\n"
         "uniform uvec4 v4ui;\n"
-        "out vec4 data;\n"
-        "void main(void) { data = vec4(v2ui[0]+v3ui[0]+v4ui[0]); }\n";
-        
-      GLint prg_id = createShader(vtx,frg);
-      
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2ui[0]+v3ui[0]+v4ui[0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
       typedef Matrix<unsigned int,2,1> Vector2ui;
       typedef Matrix<unsigned int,3,1> Vector3ui;
       typedef Matrix<unsigned int,4,1> Vector4ui;
-      
+
       VERIFY_UNIFORMi(v2ui, Vector2ui);
       VERIFY_UNIFORMi(v3ui, Vector3ui);
       VERIFY_UNIFORMi(v4ui, Vector4ui);
-      #endif
+      glDeleteProgram(prg_id);
     }
     else
-      std::cerr << "Warning: opengl 3.0 was not tested\n";
-    
-    #ifdef GLEW_ARB_gpu_shader_fp64
+#endif
+      std::cerr << "Warning: opengl 3.0 was not tested.\n";
+
+    // dvecn supported if >= 4.1 or ARB_vertex_attrib_64bit
+    bool has_fp64_native = (gl_major_version == 4 && gl_minor_version >= 1);
+    bool has_fp64_extension = false;
+#ifdef GLEW_ARB_gpu_shader_fp64
     if(GLEW_ARB_gpu_shader_fp64)
     {
-      #ifdef GL_ARB_gpu_shader_fp64
-      const char* frg = "#version 150\n"
+      // Check that extension can actually be compiled.
+      if (has_fp64_extension)
+      {
+        std::string frg = format(
+          "${GLSL_VERSION}\n"
+          "#extension GL_ARB_gpu_shader_fp64 : enable\n"
+          "uniform dvec2 dv2;\n"
+          "${FRAG_OUTPUT_DECLARATION}\n"
+          "void main(void) {\n"
+          "  ${FRAG_OUTPUT_VARIABLE} = vec4(dv2.x, dv2.y, dv2.x, dv2.y);\n"
+          "}\n",
+          glsl_vars, glsl_vals);
+        GLint prg_id = createProgram(vtx, frg, /*print_errors=*/false);
+        if (prg_id)
+        {
+          has_fp64_extension = true;
+          glDeleteProgram(prg_id);
+        }
+      }
+    }
+#endif
+
+    if( has_fp64_native || has_fp64_extension )
+    {
+      std::vector<std::string> glsl_vars_with_extension = glsl_vars;
+      glsl_vars_with_extension.push_back("${GLSL_EXTENSIONS}");
+      std::vector<std::string> glsl_vals_with_extension = glsl_vals;
+      if (has_fp64_extension)
+      {
+        glsl_vals_with_extension.push_back("#extension GL_ARB_gpu_shader_fp64 : enable");
+      }
+      else
+      {
+        glsl_vals_with_extension.push_back("");
+      }
+
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
+        "${GLSL_EXTENSIONS}\n"
         "uniform dvec2 v2d;\n"
         "uniform dvec3 v3d;\n"
         "uniform dvec4 v4d;\n"
-        "out vec4 data;\n"
-        "void main(void) { data = vec4(v2d[0]+v3d[0]+v4d[0]); }\n";
-        
-      GLint prg_id = createShader(vtx,frg);
-      
-      VERIFY_UNIFORM(dv,v2d, Vector2d);
-      VERIFY_UNIFORM(dv,v3d, Vector3d);
-      VERIFY_UNIFORM(dv,v4d, Vector4d);
-      #endif
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2d[0]+v3d[0]+v4d[0]);\n"
+        "}\n",
+        glsl_vars_with_extension, glsl_vals_with_extension);
+
+      GLint prg_id = createProgram(vtx,frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
+      VERIFY_UNIFORM(dv, v2d, Vector2d);
+      VERIFY_UNIFORM(dv, v3d, Vector3d);
+      VERIFY_UNIFORM(dv, v4d, Vector4d);
+      glDeleteProgram(prg_id);
     }
     else
-      std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n";
-    #else
-      std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n";
-    #endif
+      std::cerr << "Warning: dvec (fp64) was not tested.\n";
   }
-  
+
+  // Exit loop - Leaving main loop is supported by freeglut, otherwise we
+  // are forced to exit.
+#ifdef FREEGLUT
+  glutLeaveMainLoop();
+  // Trigger another display loop iteration. Otherwise, it just hangs.
+  glutPostRedisplay();
+#else
+  exit(0);
+#endif
+}
+
+EIGEN_DECLARE_TEST(openglsupport)
+{
+  int argc = 0;
+  glutInit(&argc, 0);
+
+  GLint glut_display_mode = GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH;
+
+#ifndef EIGEN_LEGACY_OPENGL
+  // Initialize 3.2+ OpenGL context.
+#if defined(__APPLE_CC__)
+  glut_display_mode |= GLUT_3_2_CORE_PROFILE;
+#elif defined(FREEGLUT)
+  glutInitContextVersion(3, 2);
+  glutInitContextFlags(GLUT_FORWARD_COMPATIBLE);
+  glutInitContextProfile(GLUT_CORE_PROFILE);
+#endif
+#endif
+
+  glutInitDisplayMode(glut_display_mode);
+  glutInitWindowPosition(0, 0);
+  glutInitWindowSize(10, 10);
+
+  int window = glutCreateWindow("Eigen");
+  if(window <= 0)
+  {
+    std::cerr << "Error: Unable to create GLUT Window.\n";
+    exit(1);
+  }
+
+  glewExperimental = GL_TRUE;
+  if(glewInit() != GLEW_OK)
+  {
+    std::cerr << "Warning: Failed to initialize GLEW.\n";
+    exit(1);
+  }
+
+  // Run test in display, otherwise GLUT fails to clean up and leads to memory
+  // access errors on exit.
+  glutDisplayFunc(openglsupport_test_loop);
+  glutMainLoop();
+  glutDestroyWindow(window);
 }
diff --git a/contrib/eigen/unsupported/test/polynomialsolver.cpp b/contrib/eigen/unsupported/test/polynomialsolver.cpp
index db8ad7ddae31b44bfa281af146f29838b6bb44a0..4ff9bda5a7f9946a8b130e90794ded38090d6a7f 100644
--- a/contrib/eigen/unsupported/test/polynomialsolver.cpp
+++ b/contrib/eigen/unsupported/test/polynomialsolver.cpp
@@ -207,7 +207,7 @@ void polynomialsolver(int deg)
       realRoots );
 }
 
-void test_polynomialsolver()
+EIGEN_DECLARE_TEST(polynomialsolver)
 {
   for(int i = 0; i < g_repeat; i++)
   {
diff --git a/contrib/eigen/unsupported/test/polynomialutils.cpp b/contrib/eigen/unsupported/test/polynomialutils.cpp
index 5fc968402ac50a992f39c8effebbf905bd5fba28..8ff4519967d4438cfdcd4ec8e867810748db2b10 100644
--- a/contrib/eigen/unsupported/test/polynomialutils.cpp
+++ b/contrib/eigen/unsupported/test/polynomialutils.cpp
@@ -101,7 +101,7 @@ template<typename _Scalar> void CauchyBounds_scalar()
           internal::random<int>(18,26) )) );
 }
 
-void test_polynomialutils()
+EIGEN_DECLARE_TEST(polynomialutils)
 {
   for(int i = 0; i < g_repeat; i++)
   {
diff --git a/contrib/eigen/unsupported/test/sparse_extra.cpp b/contrib/eigen/unsupported/test/sparse_extra.cpp
index 7a049c8702e5d32bca37fef4552e314a519b0356..602c2cb84de7e8d1df92586ee1c459ce727a30ee 100644
--- a/contrib/eigen/unsupported/test/sparse_extra.cpp
+++ b/contrib/eigen/unsupported/test/sparse_extra.cpp
@@ -9,9 +9,44 @@
 
 
 // import basic and product tests for deprecated DynamicSparseMatrix
+#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled
+static long g_realloc_count = 0;
+#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++;
+
+static long g_dense_op_sparse_count = 0;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN g_dense_op_sparse_count++;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN g_dense_op_sparse_count+=10;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN g_dense_op_sparse_count+=20;
+
+#define EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA 1
+#endif
+
 #define EIGEN_NO_DEPRECATED_WARNING
+// Disable counting of temporaries, since sparse_product(DynamicSparseMatrix)
+// has an extra copy-assignment.
+#define EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT
 #include "sparse_product.cpp"
+
+#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled
 #include "sparse_basic.cpp"
+#endif
+
+#if EIGEN_HAS_CXX11
+
+#ifdef min
+#undef min
+#endif
+
+#ifdef max
+#undef max
+#endif
+
+#include <unordered_map>
+#define EIGEN_UNORDERED_MAP_SUPPORT
+
+#endif
+
+
 #include <Eigen/SparseExtra>
 
 template<typename SetterType,typename DenseType, typename Scalar, int Options>
@@ -104,10 +139,8 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re
     #ifdef EIGEN_UNORDERED_MAP_SUPPORT
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) ));
     #endif
-    #ifdef _DENSE_HASH_MAP_H_
+    #ifdef EIGEN_GOOGLEHASH_SUPPORT
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) ));
-    #endif
-    #ifdef _SPARSE_HASH_MAP_H_
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) ));
     #endif
 
@@ -129,7 +162,32 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re
 
 }
 
-void test_sparse_extra()
+
+template<typename SparseMatrixType>
+void check_marketio()
+{
+  typedef Matrix<typename SparseMatrixType::Scalar, Dynamic, Dynamic> DenseMatrix;
+  Index rows = internal::random<Index>(1,100);
+  Index cols = internal::random<Index>(1,100);
+  SparseMatrixType m1, m2;
+  m1 = DenseMatrix::Random(rows, cols).sparseView();
+  saveMarket(m1, "sparse_extra.mtx");
+  loadMarket(m2, "sparse_extra.mtx");
+  VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2));
+}
+
+template<typename VectorType>
+void check_marketio_vector()
+{
+  Index size = internal::random<Index>(1,100);
+  VectorType v1, v2;
+  v1 = VectorType::Random(size);
+  saveMarketVector(v1, "vector_extra.mtx");
+  loadMarketVector(v2, "vector_extra.mtx");
+  VERIFY_IS_EQUAL(v1,v2);
+}
+
+EIGEN_DECLARE_TEST(sparse_extra)
 {
   for(int i = 0; i < g_repeat; i++) {
     int s = Eigen::internal::random<int>(1,50);
@@ -143,5 +201,26 @@ void test_sparse_extra()
 
     CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) );
     CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) );
+
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) );
+
+
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,Dynamic,1> >()) );
+
+    TEST_SET_BUT_UNUSED_VARIABLE(s);
   }
 }
diff --git a/contrib/eigen/unsupported/test/special_functions.cpp b/contrib/eigen/unsupported/test/special_functions.cpp
index 057fb3e92c76a7aebee4124f3e04c641589f8252..589bb76e10f835f6333ded87b5ed6de0d7b405a0 100644
--- a/contrib/eigen/unsupported/test/special_functions.cpp
+++ b/contrib/eigen/unsupported/test/special_functions.cpp
@@ -7,9 +7,21 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <limits.h>
 #include "main.h"
 #include "../Eigen/SpecialFunctions"
 
+// Hack to allow "implicit" conversions from double to Scalar via comma-initialization.
+template<typename Derived>
+Eigen::CommaInitializer<Derived> operator<<(Eigen::DenseBase<Derived>& dense, double v) {
+  return (dense << static_cast<typename Derived::Scalar>(v));
+}
+
+template<typename XprType>
+Eigen::CommaInitializer<XprType>& operator,(Eigen::CommaInitializer<XprType>& ci, double v) {
+  return (ci, static_cast<typename XprType::Scalar>(v));
+}
+
 template<typename X, typename Y>
 void verify_component_wise(const X& x, const Y& y)
 {
@@ -64,8 +76,8 @@ template<typename ArrayType> void array_special_functions()
       //   igamma(a, x) = gamma(a, x) / Gamma(a)
       // where Gamma and gamma are considered the standard unnormalized
       // upper and lower incomplete gamma functions, respectively.
-      ArrayType a = m1.abs() + 2;
-      ArrayType x = m2.abs() + 2;
+      ArrayType a = m1.abs() + Scalar(2);
+      ArrayType x = m2.abs() + Scalar(2);
       ArrayType zero = ArrayType::Zero(rows, cols);
       ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
       ArrayType a_m1 = a - one;
@@ -74,6 +86,7 @@ template<typename ArrayType> void array_special_functions()
       ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
       ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
 
+
       // Gamma(a, 0) == Gamma(a)
       VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
 
@@ -81,10 +94,23 @@ template<typename ArrayType> void array_special_functions()
       VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
 
       // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
-      VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
+      VERIFY_IS_APPROX(Gamma_a_x, (a - Scalar(1)) * Gamma_a_m1_x + x.pow(a-Scalar(1)) * (-x).exp());
 
       // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
-      VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
+      VERIFY_IS_APPROX(gamma_a_x, (a - Scalar(1)) * gamma_a_m1_x - x.pow(a-Scalar(1)) * (-x).exp());
+    }
+    {
+      // Verify for large a and x that values are between 0 and 1.
+      ArrayType m1 = ArrayType::Random(rows,cols);
+      ArrayType m2 = ArrayType::Random(rows,cols);
+      int max_exponent = std::numeric_limits<Scalar>::max_exponent10;
+      ArrayType a = m1.abs() *  Scalar(pow(10., max_exponent - 1));
+      ArrayType x = m2.abs() *  Scalar(pow(10., max_exponent - 1));
+      for (int i = 0; i < a.size(); ++i) {
+        Scalar igam = numext::igamma(a(i), x(i));
+        VERIFY(0 <= igam);
+        VERIFY(igam <= 1);
+      }
     }
 
     {
@@ -93,27 +119,37 @@ template<typename ArrayType> void array_special_functions()
       Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
 
       // location i*6+j corresponds to a_s[i], x_s[j].
-      Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
-                              {0.0, 0.6321205588285578, 0.7768698398515702,
-                              0.9816843611112658, 9.999500016666262e-05, 1.0},
-                              {0.0, 0.4275932955291202, 0.608374823728911,
-                              0.9539882943107686, 7.522076445089201e-07, 1.0},
-                              {0.0, 0.01898815687615381, 0.06564245437845008,
-                              0.5665298796332909, 4.166333347221828e-18, 1.0},
-                              {0.0, 0.9999780593618628, 0.9999899967080838,
-                              0.9999996219837988, 0.9991370418689945, 1.0},
-                              {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
-      Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
-                              {1.0, 0.36787944117144233, 0.22313016014842982,
-                                0.018315638888734182, 0.9999000049998333, 0.0},
-                              {1.0, 0.5724067044708798, 0.3916251762710878,
-                                0.04601170568923136, 0.9999992477923555, 0.0},
-                              {1.0, 0.9810118431238462, 0.9343575456215499,
-                                0.4334701203667089, 1.0, 0.0},
-                              {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
-                                3.7801620118431334e-07, 0.0008629581310054535,
-                                0.0},
-                              {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+      Scalar igamma_s[][6] = {
+          {Scalar(0.0), nan, nan, nan, nan, nan},
+          {Scalar(0.0), Scalar(0.6321205588285578), Scalar(0.7768698398515702),
+           Scalar(0.9816843611112658), Scalar(9.999500016666262e-05),
+           Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.4275932955291202), Scalar(0.608374823728911),
+           Scalar(0.9539882943107686), Scalar(7.522076445089201e-07),
+           Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.01898815687615381),
+           Scalar(0.06564245437845008), Scalar(0.5665298796332909),
+           Scalar(4.166333347221828e-18), Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.9999780593618628), Scalar(0.9999899967080838),
+           Scalar(0.9999996219837988), Scalar(0.9991370418689945), Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0),
+           Scalar(0.5042041932513908)}};
+      Scalar igammac_s[][6] = {
+          {nan, nan, nan, nan, nan, nan},
+          {Scalar(1.0), Scalar(0.36787944117144233),
+           Scalar(0.22313016014842982), Scalar(0.018315638888734182),
+           Scalar(0.9999000049998333), Scalar(0.0)},
+          {Scalar(1.0), Scalar(0.5724067044708798), Scalar(0.3916251762710878),
+           Scalar(0.04601170568923136), Scalar(0.9999992477923555),
+           Scalar(0.0)},
+          {Scalar(1.0), Scalar(0.9810118431238462), Scalar(0.9343575456215499),
+           Scalar(0.4334701203667089), Scalar(1.0), Scalar(0.0)},
+          {Scalar(1.0), Scalar(2.1940638138146658e-05),
+           Scalar(1.0003291916285e-05), Scalar(3.7801620118431334e-07),
+           Scalar(0.0008629581310054535), Scalar(0.0)},
+          {Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0),
+           Scalar(0.49579580674813944)}};
+
       for (int i = 0; i < 6; ++i) {
         for (int j = 0; j < 6; ++j) {
           if ((std::isnan)(igamma_s[i][j])) {
@@ -133,12 +169,32 @@ template<typename ArrayType> void array_special_functions()
   }
 #endif  // EIGEN_HAS_C99_MATH
 
+  // Check the ndtri function against scipy.special.ndtri
+  {
+    ArrayType x(7), res(7), ref(7);
+    x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01;
+    ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+    CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); );
+
+    // ndtri(normal_cdf(x)) ~= x
+    CALL_SUBTEST(
+        ArrayType m1 = ArrayType::Random(32);
+        using std::sqrt;
+
+        ArrayType cdf_val = (m1 / Scalar(sqrt(2.))).erf();
+        cdf_val = (cdf_val + Scalar(1)) / Scalar(2);
+        verify_component_wise(cdf_val.ndtri(), m1););
+
+  }
+
   // Check the zeta function against scipy.special.zeta
   {
-    ArrayType x(7), q(7), res(7), ref(7);
-    x << 1.5,   4, 10.5, 10000.5,    3, 1,        0.9;
-    q << 2,   1.5,    3,  1.0001, -2.5, 1.2345, 1.2345;
-    ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan;
+    ArrayType x(10), q(10), res(10), ref(10);
+    x << 1.5,   4, 10.5, 10000.5,    3,      1,    0.9,  2,  3,  4;
+    q <<   2, 1.5,    3,  1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3;
+    ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf;
     CALL_SUBTEST( verify_component_wise(ref, ref); );
     CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); );
     CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );
@@ -146,22 +202,21 @@ template<typename ArrayType> void array_special_functions()
 
   // digamma
   {
-    ArrayType x(7), res(7), ref(7);
-    x << 1, 1.5, 4, -10.5, 10000.5, 0, -1;
-    ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf;
+    ArrayType x(9), res(9), ref(9);
+    x << 1, 1.5, 4, -10.5, 10000.5, 0, -1, -2, -3;
+    ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, nan, nan, nan, nan;
     CALL_SUBTEST( verify_component_wise(ref, ref); );
 
     CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); );
     CALL_SUBTEST( res = digamma(x);  verify_component_wise(res, ref); );
   }
 
-
 #if EIGEN_HAS_C99_MATH
   {
-    ArrayType n(11), x(11), res(11), ref(11);
-    n << 1, 1,    1, 1.5,   17,   31,   28,    8, 42, 147, 170;
-    x << 2, 3, 25.5, 1.5,  4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64;
-    ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927;
+    ArrayType n(16), x(16), res(16), ref(16);
+    n << 1, 1,    1, 1.5,   17,   31,   28,    8,   42,  147, 170, -1,  0,  1,  2,  3;
+    x << 2, 3, 25.5, 1.5,  4.7, 11.8, 17.7, 30.2, 15.8, 54.1,  64, -1, -2, -3, -4, -5;
+    ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927, nan, nan, plusinf, nan, plusinf;
     CALL_SUBTEST( verify_component_wise(ref, ref); );
 
     if(sizeof(RealScalar)>=8) {  // double
@@ -288,8 +343,8 @@ template<typename ArrayType> void array_special_functions()
     ArrayType m3 = ArrayType::Random(32);
     ArrayType one = ArrayType::Constant(32, Scalar(1.0));
     const Scalar eps = std::numeric_limits<Scalar>::epsilon();
-    ArrayType a = (m1 * 4.0).exp();
-    ArrayType b = (m2 * 4.0).exp();
+    ArrayType a = (m1 * Scalar(4)).exp();
+    ArrayType b = (m2 * Scalar(4)).exp();
     ArrayType x = m3.abs();
 
     // betainc(a, 1, x) == x**a
@@ -335,11 +390,108 @@ template<typename ArrayType> void array_special_functions()
         ArrayType test = betainc(a, b + one, x) + eps;
         verify_component_wise(test, expected););
   }
-#endif
+#endif  // EIGEN_HAS_C99_MATH
+
+    /* Code to generate the data for the following two test cases.
+    N = 5
+    np.random.seed(3)
+
+    a = np.logspace(-2, 3, 6)
+    a = np.ravel(np.tile(np.reshape(a, [-1, 1]), [1, N]))
+    x = np.random.gamma(a, 1.0)
+    x = np.maximum(x, np.finfo(np.float32).tiny)
+
+    def igamma(a, x):
+      return mpmath.gammainc(a, 0, x, regularized=True)
+
+    def igamma_der_a(a, x):
+      res = mpmath.diff(lambda a_prime: igamma(a_prime, x), a)
+      return np.float64(res)
+
+    def gamma_sample_der_alpha(a, x):
+      igamma_x = igamma(a, x)
+      def igammainv_of_igamma(a_prime):
+        return mpmath.findroot(lambda x_prime: igamma(a_prime, x_prime) -
+            igamma_x, x, solver='newton')
+      return np.float64(mpmath.diff(igammainv_of_igamma, a))
+
+    v_igamma_der_a = np.vectorize(igamma_der_a)(a, x)
+    v_gamma_sample_der_alpha = np.vectorize(gamma_sample_der_alpha)(a, x)
+  */
+
+#if EIGEN_HAS_C99_MATH
+  // Test igamma_der_a
+  {
+    ArrayType a(30);
+    ArrayType x(30);
+    ArrayType res(30);
+    ArrayType v(30);
+
+    a << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0,
+        1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+        100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+    x << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+        1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+        0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+        0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+        0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+        10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+        86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+        969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+    v << -32.7256441441, -36.4394150514, -9.66467612263, -36.4394150514,
+        -36.4394150514, -1.0891900302, -2.66351229645, -2.48666868596,
+        -0.929700494428, -3.56327722764, -0.455320135314, -0.391437214323,
+        -0.491352055991, -0.350454834292, -0.471773162921, -0.104084440522,
+        -0.0723646747909, -0.0992828975532, -0.121638215446, -0.122619605294,
+        -0.0317670267286, -0.0359974812869, -0.0154359225363, -0.0375775365921,
+        -0.00794899153653, -0.00777303219211, -0.00796085782042,
+        -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+    CALL_SUBTEST(res = igamma_der_a(a, x); verify_component_wise(res, v););
+  }
+
+  // Test gamma_sample_der_alpha
+  {
+    ArrayType alpha(30);
+    ArrayType sample(30);
+    ArrayType res(30);
+    ArrayType v(30);
+
+    alpha << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+        1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+        100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+    sample << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+        1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+        0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+        0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+        0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+        10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+        86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+        969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+    v << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+        1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+        0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+        1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+        0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+        1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+        0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+        1.00106492525, 0.97734200649, 1.02198794179;
+
+    CALL_SUBTEST(res = gamma_sample_der_alpha(alpha, sample);
+                 verify_component_wise(res, v););
+  }
+#endif  // EIGEN_HAS_C99_MATH
 }
 
-void test_special_functions()
+EIGEN_DECLARE_TEST(special_functions)
 {
   CALL_SUBTEST_1(array_special_functions<ArrayXf>());
   CALL_SUBTEST_2(array_special_functions<ArrayXd>());
+  // TODO(cantonios): half/bfloat16 don't have enough precision to reproduce results above.
+  // CALL_SUBTEST_3(array_special_functions<ArrayX<Eigen::half>>());
+  // CALL_SUBTEST_4(array_special_functions<ArrayX<Eigen::bfloat16>>());
 }
diff --git a/contrib/eigen/unsupported/test/special_packetmath.cpp b/contrib/eigen/unsupported/test/special_packetmath.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..31233f1b0b13a962e7714e2514d94e40e929d57b
--- /dev/null
+++ b/contrib/eigen/unsupported/test/special_packetmath.cpp
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <limits>
+#include "packetmath_test_shared.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename Scalar,typename Packet> void packetmath_real()
+{
+  using std::abs;
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+  const int size = PacketSize*4;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize*4];
+
+#if EIGEN_HAS_C99_MATH
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h;
+    h.store(data2, internal::plgamma(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  if (internal::packet_traits<Scalar>::HasErf) {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasErf,Packet> h;
+    h.store(data2, internal::perf(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasErfc,Packet> h;
+    h.store(data2, internal::perfc(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  {
+    for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0),Scalar(1));
+    }
+    CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri);
+  }
+#endif  // EIGEN_HAS_C99_MATH
+
+  // For bessel_i*e and bessel_j*, the valid range is negative reals.
+  {
+    const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 6);
+    for (int i=0; i<size; ++i)
+    {
+      data1[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+      data2[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+    }
+
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1e, internal::pbessel_i1e);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1);
+  }
+
+  // Use a smaller data range for the bessel_i* as these can become very large.
+  // Following #1693, we also restrict this range further to avoid inf's due to
+  // differences in pexp and exp.
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1, internal::pbessel_i1);
+
+
+  // y_i, and k_i are valid for x > 0.
+  {
+    const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 5);
+    for (int i=0; i<size; ++i)
+    {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+    }
+  }
+
+  // TODO(srvasude): Re-enable this test once properly investigated why the
+  // scalar and vector paths differ.
+  // CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y0, internal::pbessel_y0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y1, internal::pbessel_y1);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0e, internal::pbessel_k0e);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1e, internal::pbessel_k1e);
+
+  // Following #1693, we restrict the range for exp to avoid zeroing out too
+  // fast.
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0, internal::pbessel_k0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1, internal::pbessel_k1);
+
+
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+
+#if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11)
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
+#endif
+
+}
+
+namespace Eigen {
+namespace test {
+
+template<typename Scalar,typename PacketType, bool IsComplex, bool IsInteger>
+struct runall {
+  static void run() {
+    packetmath_real<Scalar,PacketType>();
+  }
+};
+
+}
+}
+
+EIGEN_DECLARE_TEST(special_packetmath)
+{
+  g_first_pass = true;
+  for(int i = 0; i < g_repeat; i++) {
+
+    CALL_SUBTEST_1( test::runner<float>::run() );
+    CALL_SUBTEST_2( test::runner<double>::run() );
+    CALL_SUBTEST_3( test::runner<Eigen::half>::run() );
+    CALL_SUBTEST_4( test::runner<Eigen::bfloat16>::run() );
+    g_first_pass = false;
+  }
+}
diff --git a/contrib/eigen/unsupported/test/splines.cpp b/contrib/eigen/unsupported/test/splines.cpp
index 3be020434ac3502370644cdf3a30e8f394073311..88ec87b97a77fe836432b0e86d52ff311d8f0a5d 100644
--- a/contrib/eigen/unsupported/test/splines.cpp
+++ b/contrib/eigen/unsupported/test/splines.cpp
@@ -268,7 +268,7 @@ void check_global_interpolation_with_derivatives2d()
   }
 }
 
-void test_splines()
+EIGEN_DECLARE_TEST(splines)
 {
   for (int i = 0; i < g_repeat; ++i)
   {
diff --git a/src/common/ColorMap.cpp b/src/common/ColorMap.cpp
index ee81252114c277aace46b6ccfc6ed9899831df34..a5220bc1774812562db552206a73bbb1f10b04ba 100644
--- a/src/common/ColorMap.cpp
+++ b/src/common/ColorMap.cpp
@@ -6,6 +6,7 @@
 #include "ColorMap.h"
 
 #include "Color.h"
+#include "Message.h"
 
 #include <cmath>
 
@@ -44,6 +45,13 @@ namespace gmshfem
           b += c_b[i] * std::pow(x, 3 - i);
         }
       };
+      auto quarticLow = [](const double c_r[], const double c_g[], const double c_b[], double &r, double &g, double &b, const double x) {
+        for(auto i = 0; i < 5; ++i) {
+          r += c_r[i] * std::pow(x, 4 - i);
+          g += c_g[i] * std::pow(x, 4 - i);
+          b += c_b[i] * std::pow(x, 4 - i);
+        }
+      };
       switch(_name) {
       case ColorMapName::Autumn: {
         const double x = 1. - (double)item / (_numItem - 1);
@@ -176,6 +184,16 @@ namespace gmshfem
         cubicLow(c_r, c_g, c_b, r, g, b, x);
         return Color((unsigned char)r, (unsigned char)g, (unsigned char)b);
       } break;
+      case ColorMapName::Twilight: {
+        const double x = 1. - (double)item / (_numItem - 1);
+        const double c_r[5]{2016., -5189.3, 4098., -695.67, 7.};
+        const double c_g[5]{629.33, -1754.7, 1152.7, 55.667, 57.};
+        const double c_b[5]{906.67, -1882.7, 823.33, 121.67, 121.};
+        double r = 0., g = 0., b = 0.;
+        quarticLow(c_r, c_g, c_b, r, g, b, x);
+        if(r < 0.) r = 0.;
+        return Color((unsigned char)r, (unsigned char)g, (unsigned char)b);
+      } break;
       default:
         break;
       }
diff --git a/src/common/ColorMap.h b/src/common/ColorMap.h
index c0d6bc6a3dd0e79bd8292a9382a322fb496ad4ae..a9f2e140a299e775860af51c8294a3a722918bbc 100644
--- a/src/common/ColorMap.h
+++ b/src/common/ColorMap.h
@@ -28,7 +28,8 @@ namespace gmshfem
       Spectrum,
       Summer,
       Thermal,
-      Watermelon
+      Watermelon,
+      Twilight
     };
 
     class ColorMap