diff --git a/CMakeLists.txt b/CMakeLists.txt index e0e002c..b8507ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,10 @@ cmake_minimum_required (VERSION 2.6) -project (odlpp) +project (odlcuda) + +if(NOT EXISTS "${CMAKE_SOURCE_DIR}/odl_cpp_utils/CMakeLists.txt") + message(FATAL_ERROR "odl_cpp_utils not initialized. Run 'git submodule update --init'." ) +endif() #Set binary dir set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin) @@ -92,8 +96,5 @@ include_directories(${PYTHON_NUMPY_INCLUDE_DIR}) link_directories(${PYTHON_LIBRARIES}) # Add sub directories as needed -add_subdirectory (odlpp) -if(NOT EXISTS "${CMAKE_SOURCE_DIR}/odl_cpp_utils/CMakeLists.txt") - message(SEND_ERROR "odl_cpp_utils not initialized. Run 'git submodule update --init'." ) -endif() +add_subdirectory (odlcuda) add_subdirectory (odl_cpp_utils) diff --git a/README.md b/README.md index a97a277..51d3142 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ -ODLpp -===== +odlcuda +======= -C++ backend for ODL +CUDA backend for [ODL](https://github.com/odlgroup/odl) Introduction --------------------- -This is a default backend for ODL, intended to contain examples of ODL interactions with C++. +------------ +This is a default CUDA backend for ODL. It contains a default implementation of a CUDA based Rn space with associated methods. -It contains a default implementation of a CUDA based Rn space with associated methods. +By installing this package, CUDA spaces become available to `rn`, `uniform_discr` and related spaces in ODL through the `impl='cuda'` option. Building --------------------- +-------- The project uses CMake to enable builds. Using *cmake-gui* is recommended [Cmake webpage](http://www.cmake.org/) @@ -28,7 +28,7 @@ To build and install the package to your python installation, run (as root) make pyinstall -To verify your installation, run (in the odlpp root directory) +To verify your installation, run (in the odlcuda root directory) py.test @@ -62,7 +62,7 @@ External Dependences Current external dependencies are #####Python -The building block of ODL, ODLpp needs access to both python and numpy header files and compiled files to link against. +The building block of ODL, odlcuda needs access to both python and numpy header files and compiled files to link against. #####Boost General library with C++ code. This project specifically uses [Boost.Python](http://www.boost.org/doc/libs/1_58_0/libs/python/doc/index.html) to handle the python bindings. @@ -95,7 +95,7 @@ There are a few common errors encountered, this is the solution to some of these * Compiling - [ 20%] Building NVCC (Device) object ODLpp/CMakeFiles/cuda.dir//./cuda_generated_cuda.cu.o /usr/include/c++/4.9.2/bits/alloc_traits.h(248): error: expected a ">" + [ 20%] Building NVCC (Device) object odlcuda/CMakeFiles/cuda.dir//./cuda_generated_cuda.cu.o /usr/include/c++/4.9.2/bits/alloc_traits.h(248): error: expected a ">" It may be that you are trying to compile with CUDA 6.5 and GCC 4.9, this combination is not supported by CUDA. @@ -103,12 +103,12 @@ There are a few common errors encountered, this is the solution to some of these error LNK1112: module machine type 'x64' conflicts with target machine type 'X86' - You have a 64-bit library on your path (Boost for instance) while trying to build 32-bit odlpp. Either change the lib, or configure to build 64-bit. 
On Windows, if you are using Visual Studio to compile use Configuration Manager to set platform to x64, if you are compiling on command line via CMake ensure that the Script Generator is for instance "Visual Studio 12 2013 Win64" (note the Win64 at the end). + You have a 64-bit library on your path (Boost for instance) while trying to build 32-bit odlcuda. Either change the lib, or configure to build 64-bit. On Windows, if you are using Visual Studio to compile use Configuration Manager to set platform to x64, if you are compiling on command line via CMake ensure that the Script Generator is for instance "Visual Studio 12 2013 Win64" (note the Win64 at the end). * If you get a error like - Error 5 error LNK2019: unresolved external symbol "__declspec(dllimport) struct _object * __cdecl boost::python::detail::init_module(struct PyModuleDef &,void (__cdecl*)(void))" (__imp_?init_module@detail@python@boost@@YAPEAU_object@@AEAUPyModuleDef@@P6AXXZ@Z) referenced in function PyInit_utils C:\Programming\Projects\ODLpp_bin\RLcpp\utils.obj utils + Error 5 error LNK2019: unresolved external symbol "__declspec(dllimport) struct _object * __cdecl boost::python::detail::init_module(struct PyModuleDef &,void (__cdecl*)(void))" (__imp_?init_module@detail@python@boost@@YAPEAU_object@@AEAUPyModuleDef@@P6AXXZ@Z) referenced in function PyInit_utils C:\Programming\Projects\odlcuda_bin\odlcuda\utils.obj utils then it is likely that you are trying to build against unmatched python header files and boost python version @@ -122,7 +122,7 @@ There are a few common errors encountered, this is the solution to some of these * If, when running the test in python, you encounter an error like - ImportError: No module named odlpp + ImportError: No module named odlcuda It may be that you have not installed the package, run (as root) `make pyinstall` or equivalent. diff --git a/examples/cudaspace_speedtest.py b/examples/cudaspace_speedtest.py new file mode 100644 index 0000000..50df23e --- /dev/null +++ b/examples/cudaspace_speedtest.py @@ -0,0 +1,82 @@ +# Copyright 2014-2016 The ODL development group +# +# This file is part of ODL. +# +# ODL is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ODL is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with ODL. If not, see . 
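The README above notes that installing this package makes CUDA spaces available through ODL's `impl='cuda'` option. A minimal sketch of that entry point from the ODL side, assuming the package is installed, a CUDA device is present, and the installed ODL version accepts the `impl` argument on `rn` and `uniform_discr` as described (the sizes are illustrative):

    import odl

    # CUDA-backed rn space; storage and arithmetic live on the GPU.
    space = odl.rn(1000, impl='cuda')
    x = space.element(range(1000))
    print(x.norm())

    # The same option applies to discretized function spaces.
    discr = odl.uniform_discr(0, 1, 1000, impl='cuda')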
+ +"""Speed comparison between CPU and GPU spaces.""" + +# Imports for common Python 2/3 codebase +from __future__ import print_function, division, absolute_import +from future import standard_library +standard_library.install_aliases() +from builtins import range + +import numpy as np +import odl +from odl.util.testutils import Timer + +n = 10**7 +iterations = 100 +device_space = odl.CudaRn(n) +host_space = odl.Rn(n) +x = np.random.rand(n) +y = np.random.rand(n) +z = np.empty(n) + +x_dev = device_space.element(x) +y_dev = device_space.element(y) +z_dev = device_space.element(z) +x_host = host_space.element(x) +y_host = host_space.element(y) +z_host = host_space.element(z) + + +def run_test(function, message): + with Timer('+GPU ' + message): + for _ in range(iterations): + function(z_dev, x_dev, y_dev) + + with Timer('-CPU ' + message): + for _ in range(iterations): + function(z_host, x_host, y_host) + +# lincomb tests +run_test(lambda z, x, y: x.space.lincomb(0, x, 0, y, z), "z = 0") +run_test(lambda z, x, y: x.space.lincomb(0, x, 1, y, z), "z = y") +run_test(lambda z, x, y: x.space.lincomb(0, x, 2, y, z), "z = b*y") + +run_test(lambda z, x, y: x.space.lincomb(1, x, 0, y, z), "z = x") +run_test(lambda z, x, y: x.space.lincomb(1, x, 1, y, z), "z = x + y") +run_test(lambda z, x, y: x.space.lincomb(1, x, 2, y, z), "z = x + b*y") + +run_test(lambda z, x, y: x.space.lincomb(2, x, 0, y, z), "z = a*x") +run_test(lambda z, x, y: x.space.lincomb(2, x, 1, y, z), "z = a*x + y") +run_test(lambda z, x, y: x.space.lincomb(2, x, 2, y, z), "z = a*x + b*y") + +# Test optimization for 1 aliased vector +run_test(lambda z, x, y: x.space.lincomb(1, z, 0, y, z), "z = z") +run_test(lambda z, x, y: x.space.lincomb(1, z, 1, y, z), "z += y") +run_test(lambda z, x, y: x.space.lincomb(1, z, 2, y, z), "z += b*y") + +run_test(lambda z, x, y: x.space.lincomb(2, z, 0, y, z), "z = a*z") +run_test(lambda z, x, y: x.space.lincomb(2, z, 1, y, z), "z = a*z + y") +run_test(lambda z, x, y: x.space.lincomb(2, z, 2, y, z), "z = a*z + b*y") + +# Test optimization for 2 aliased vectors +run_test(lambda z, x, y: x.space.lincomb(2, z, 2, z, z), "z = (a+b)*z") + +# Non lincomb tests +run_test(lambda z, x, y: x + y, "z = x + y") +run_test(lambda z, x, y: y.assign(x), "y.assign(x)") diff --git a/odlpp/CMakeLists.txt b/odlcuda/CMakeLists.txt similarity index 69% rename from odlpp/CMakeLists.txt rename to odlcuda/CMakeLists.txt index 25c46bf..59a9743 100644 --- a/odlpp/CMakeLists.txt +++ b/odlcuda/CMakeLists.txt @@ -11,19 +11,25 @@ else(WIN32) endif(WIN32) if(CUDA_ENABLED) - set(PYTHON_LIBS ${PYTHON_LIBS} "odlpp_cuda") + set(PYTHON_LIBS ${PYTHON_LIBS} "odlcuda_") endif(CUDA_ENABLED) set(INSTALL_LIB "pyinstall") add_custom_target(${INSTALL_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/__init__.py ${TARGET_PY}/__init__.py) set_property(TARGET ${INSTALL_LIB} PROPERTY FOLDER python) -# do the copying +# copy raw python files +set(python_files odl_plugin.py setup.py cu_ntuples.py ufuncs.py) +foreach(file_i ${python_files}) + add_custom_command(TARGET ${INSTALL_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/${file_i} ${TARGET_PY}/${file_i}) +endforeach() + +# copy compiled files foreach(file_i ${PYTHON_LIBS}) add_custom_command(TARGET ${INSTALL_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${TARGET_PY}/${FILESTART}${file_i}.${FILETYPE} ${TARGET_PY}/${file_i}.${TARGETFILETYPE}) endforeach() -add_custom_command(TARGET ${INSTALL_LIB} COMMAND cd ${TARGET_PY} && ${PYTHON_EXECUTABLE} 
${CMAKE_CURRENT_SOURCE_DIR}/setup.py install) +add_custom_command(TARGET ${INSTALL_LIB} COMMAND cd ${TARGET_PY} && ${PYTHON_EXECUTABLE} setup.py install) add_dependencies(${INSTALL_LIB} ${PYTHON_LIBS}) diff --git a/odlpp/__init__.py b/odlcuda/__init__.py similarity index 76% rename from odlpp/__init__.py rename to odlcuda/__init__.py index 311d112..5283600 100644 --- a/odlpp/__init__.py +++ b/odlcuda/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2014, 2015 The ODL development group +# Copyright 2014-2016 The ODL development group # # This file is part of ODL. # @@ -17,6 +17,9 @@ from __future__ import absolute_import -from . import odlpp_cuda +# Workaround for weird bug. Nothing is installed without this +_install_location = __file__ -# __all__ = [''] +from . import odlcuda_ +from . import cu_ntuples +from . import ufuncs diff --git a/odlcuda/cu_ntuples.py b/odlcuda/cu_ntuples.py new file mode 100644 index 0000000..193e71b --- /dev/null +++ b/odlcuda/cu_ntuples.py @@ -0,0 +1,1457 @@ +# Copyright 2014-2016 The ODL development group +# +# This file is part of ODL. +# +# ODL is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ODL is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with ODL. If not, see . + +"""CUDA implementation of n-dimensional Cartesian spaces.""" + +# Imports for common Python 2/3 codebase +from __future__ import print_function, division, absolute_import +from future import standard_library +standard_library.install_aliases() +from builtins import int, super + +import numpy as np + +from odl.set.sets import RealNumbers, ComplexNumbers +from odl.set.space import LinearSpaceVector +from odl.space.base_ntuples import ( + NtuplesBase, NtuplesBaseVector, FnBase, FnBaseVector) +from odl.space.weighting import ( + WeightingBase, VectorWeightingBase, ConstWeightingBase, NoWeightingBase, + CustomInnerProductBase, CustomNormBase, CustomDistBase) +from odl.util.utility import dtype_repr + +from odlcuda.ufuncs import CudaNtuplesUFuncs + +try: + import odlcuda.odlcuda_ as backend + CUDA_AVAILABLE = True +except ImportError: + backend = None + CUDA_AVAILABLE = False + + +__all__ = ('CudaNtuples', 'CudaNtuplesVector', + 'CudaFn', 'CudaFnVector', + 'CUDA_DTYPES', 'CUDA_AVAILABLE', + 'CudaFnConstWeighting', 'CudaFnVectorWeighting', + 'cu_weighted_inner', 'cu_weighted_norm', 'cu_weighted_dist') + + +def _get_int_type(): + """Return the correct int vector type on the current platform.""" + if np.dtype(np.int).itemsize == 4: + return 'CudaVectorInt32' + elif np.dtype(np.int).itemsize == 8: + return 'CudaVectorInt64' + else: + return 'CudaVectorIntNOT_AVAILABLE' + + +def _add_if_exists(dtype, name): + """Add ``dtype`` to ``CUDA_DTYPES`` if it's available.""" + if hasattr(backend, name): + _TYPE_MAP_NPY2CUDA[np.dtype(dtype)] = getattr(backend, name) + CUDA_DTYPES.append(np.dtype(dtype)) + + +# A list of all available dtypes +CUDA_DTYPES = [] + +# Typemap from numpy dtype to implementations +_TYPE_MAP_NPY2CUDA = {} + +# Initialize the available dtypes +_add_if_exists(np.float, 'CudaVectorFloat64') +_add_if_exists(np.float32, 'CudaVectorFloat32') 
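Each `_add_if_exists` call here (the registration list continues below) registers a dtype only if the compiled `odlcuda_` extension exposes the matching `CudaVector*` class, so the contents of `CUDA_DTYPES` depend on how the extension was built. A hedged sketch of how calling code might guard on this; `cuda_space_or_none` is a hypothetical helper and not part of this module:

    import numpy as np

    def cuda_space_or_none(size, dtype='float32'):
        # Return a CudaFn space if this build supports ``dtype``, otherwise
        # None so that the caller can fall back to a CPU space.
        if not CUDA_AVAILABLE or np.dtype(dtype) not in CUDA_DTYPES:
            return None
        return CudaFn(size, dtype=dtype)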
+_add_if_exists(np.float64, 'CudaVectorFloat64') +_add_if_exists(np.int, _get_int_type()) +_add_if_exists(np.int8, 'CudaVectorInt8') +_add_if_exists(np.int16, 'CudaVectorInt16') +_add_if_exists(np.int32, 'CudaVectorInt32') +_add_if_exists(np.int64, 'CudaVectorInt64') +_add_if_exists(np.uint8, 'CudaVectorUInt8') +_add_if_exists(np.uint16, 'CudaVectorUInt16') +_add_if_exists(np.uint32, 'CudaVectorUInt32') +_add_if_exists(np.uint64, 'CudaVectorUInt64') +CUDA_DTYPES = list(set(CUDA_DTYPES)) # Remove duplicates + + +class CudaNtuples(NtuplesBase): + """The space `NtuplesBase`, implemented in CUDA.""" + + def __init__(self, size, dtype): + """Initialize a new instance. + + Parameters + ---------- + size : `int` + The number entries per tuple + dtype : `object` + The data type for each tuple entry. Can be provided in any + way the `numpy.dtype` function understands, most notably + as built-in type, as one of NumPy's internal datatype + objects or as string. + + Check ``CUDA_DTYPES`` for a list of available data types. + """ + + if np.dtype(dtype) not in _TYPE_MAP_NPY2CUDA.keys(): + raise TypeError('data type {!r} not supported in CUDA' + ''.format(dtype)) + + super().__init__(size, dtype) + + self._vector_impl = _TYPE_MAP_NPY2CUDA[self._dtype] + + def element(self, inp=None, data_ptr=None): + """Create a new element. + + Parameters + ---------- + inp : `array-like` or scalar, optional + Input to initialize the new element. + + If ``inp`` is a `numpy.ndarray` of shape ``(size,)`` + and the same data type as this space, the array is wrapped, + not copied. + Other array-like objects are copied (with broadcasting + if necessary). + + If a single value is given, it is copied to all entries. + + If both ``inp`` and ``data_ptr`` are `None`, an empty + element is created with no guarantee of its state + (memory allocation only). + + data_ptr : `int`, optional + Memory address of a CUDA array container + + Cannot be combined with ``inp``. + + Returns + ------- + element : `CudaNtuplesVector` + The new element + + Notes + ----- + This method preserves "array views" of correct size and type, + see the examples below. + + TODO: No, it does not yet! + + Examples + -------- + >>> uc3 = CudaNtuples(3, 'uint8') + >>> x = uc3.element(np.array([1, 2, 3], dtype='uint8')) + >>> x + CudaNtuples(3, 'uint8').element([1, 2, 3]) + >>> y = uc3.element([1, 2, 3]) + >>> y + CudaNtuples(3, 'uint8').element([1, 2, 3]) + """ + if inp is None: + if data_ptr is None: + return self.element_type(self, self._vector_impl(self.size)) + else: # TODO: handle non-1 length strides + return self.element_type( + self, self._vector_impl.from_pointer(data_ptr, self.size, + 1)) + else: + if data_ptr is None: + if isinstance(inp, self._vector_impl): + return self.element_type(self, inp) + elif isinstance(inp, self.element_type): + if inp in self: + return inp + else: + # Bulk-copy for non-matching dtypes + elem = self.element() + elem[:] = inp + return elem + else: + # Array-like input. 
Need to go through a NumPy array + arr = np.array(inp, copy=False, dtype=self.dtype, ndmin=1) + if arr.shape != (self.size,): + raise ValueError('expected input shape {}, got {}' + ''.format((self.size,), arr.shape)) + elem = self.element() + elem[:] = arr + return elem + else: + raise ValueError('cannot provide both `inp` and `data_ptr`') + + @property + def element_type(self): + """ `CudaNtuplesVector` """ + return CudaNtuplesVector + + impl = 'cuda' + + @staticmethod + def available_dtypes(): + """Return the available data types.""" + return CUDA_DTYPES + + @staticmethod + def default_dtype(field=None): + """Return the default of `Fn` data type for a given field. + + Parameters + ---------- + field : `Field` + Set of numbers to be represented by a data type. + Currently supported : `RealNumbers`, `ComplexNumbers` + + Returns + ------- + dtype : `type` + Numpy data type specifier. The returned defaults are: + + ``RealNumbers()`` : ``np.dtype('float32')`` + + ``ComplexNumbers()`` : `NotImplemented` + """ + if field is None or field == RealNumbers(): + return np.dtype('float32') + elif field == ComplexNumbers(): + return NotImplemented + else: + raise ValueError('no default data type defined for field {}' + ''.format(field)) + + +class CudaNtuplesVector(NtuplesBaseVector, LinearSpaceVector): + + """Representation of a `CudaNtuples` element.""" + + def __init__(self, space, data): + """Initialize a new instance.""" + if not isinstance(space, CudaNtuples): + raise TypeError('{!r} not a `CudaNtuples` instance' + ''.format(space)) + + super().__init__(space) + + if not isinstance(data, self._space._vector_impl): + raise TypeError('data {!r} not a `{}` instance' + ''.format(data, self._space._vector_impl)) + self._data = data + + @property + def data(self): + """The data of this vector. + + Parameters + ---------- + None + + Returns + ------- + ptr : CudaFnVectorImpl + Underlying cuda data representation + + Examples + -------- + """ + return self._data + + @property + def data_ptr(self): + """A raw pointer to the data of this vector.""" + return self.data.data_ptr() + + def __eq__(self, other): + """Return ``self == other``. + + Returns + ------- + equals : `bool` + `True` if all elements of ``other`` are equal to this + vector's elements, `False` otherwise + + Examples + -------- + >>> r3 = CudaNtuples(3, 'float32') + >>> x = r3.element([1, 2, 3]) + >>> x == x + True + >>> y = r3.element([1, 2, 3]) + >>> x == y + True + >>> y = r3.element([0, 0, 0]) + >>> x == y + False + >>> r3_2 = CudaNtuples(3, 'uint8') + >>> z = r3_2.element([1, 2, 3]) + >>> x != z + True + """ + if other is self: + return True + elif other not in self.space: + return False + else: + return self.data == other.data + + def copy(self): + """Create an identical (deep) copy of this vector. + + Returns + ------- + copy : `CudaNtuplesVector` + The deep copy + + Examples + -------- + >>> vec1 = CudaNtuples(3, 'uint8').element([1, 2, 3]) + >>> vec2 = vec1.copy() + >>> vec2 + CudaNtuples(3, 'uint8').element([1, 2, 3]) + >>> vec1 == vec2 + True + >>> vec1 is vec2 + False + """ + return self.space.element_type(self.space, self.data.copy()) + + def asarray(self, start=None, stop=None, step=None, out=None): + """Extract the data of this array as a numpy array. + + Parameters + ---------- + start : `int`, optional + Start position. None means the first element. + start : `int`, optional + One element past the last element to be extracted. + None means the last element. + start : `int`, optional + Step length. None means 1. 
+ out : `numpy.ndarray` + Array in which the result should be written in-place. + Has to be contiguous and of the correct dtype. + + Returns + ------- + asarray : `numpy.ndarray` + Numpy array of the same type as the space. + + Examples + -------- + >>> uc3 = CudaNtuples(3, 'uint8') + >>> y = uc3.element([1, 2, 3]) + >>> y.asarray() + array([1, 2, 3], dtype=uint8) + >>> y.asarray(1, 3) + array([2, 3], dtype=uint8) + + Using the out parameter + + >>> out = np.empty((3,), dtype='uint8') + >>> result = y.asarray(out=out) + >>> out + array([1, 2, 3], dtype=uint8) + >>> result is out + True + """ + if out is None: + return self.data.get_to_host(slice(start, stop, step)) + else: + self.data.copy_device_to_host(slice(start, stop, step), out) + return out + + def __getitem__(self, indices): + """Access values of this vector. + + This will cause the values to be copied to CPU + which is a slow operation. + + Parameters + ---------- + indices : `int` or `slice` + The position(s) that should be accessed + + Returns + ------- + values : scalar or `CudaNtuplesVector` + The value(s) at the index (indices) + + + Examples + -------- + >>> uc3 = CudaNtuples(3, 'uint8') + >>> y = uc3.element([1, 2, 3]) + >>> y[0] + 1 + >>> z = y[1:3] + >>> z + CudaNtuples(2, 'uint8').element([2, 3]) + >>> y[::2] + CudaNtuples(2, 'uint8').element([1, 3]) + >>> y[::-1] + CudaNtuples(3, 'uint8').element([3, 2, 1]) + + The returned value is a view, modifications are reflected + in the original data: + + >>> z[:] = [4, 5] + >>> y + CudaNtuples(3, 'uint8').element([1, 4, 5]) + """ + if isinstance(indices, slice): + data = self.data.getslice(indices) + return type(self.space)(data.size, data.dtype).element(data) + else: + return self.data.__getitem__(indices) + + def __setitem__(self, indices, values): + """Set values of this vector. + + This will cause the values to be copied to CPU + which is a slow operation. + + Parameters + ---------- + indices : `int` or `slice` + The position(s) that should be set + values : scalar, `array-like` or `CudaNtuplesVector` + The value(s) that are to be assigned. + + If ``index`` is an `int`, ``value`` must be single value. + + If ``index`` is a `slice`, ``value`` must be broadcastable + to the size of the slice (same size, shape (1,) + or single value). + + Returns + ------- + `None` + + Examples + -------- + >>> uc3 = CudaNtuples(3, 'uint8') + >>> y = uc3.element([1, 2, 3]) + >>> y[0] = 5 + >>> y + CudaNtuples(3, 'uint8').element([5, 2, 3]) + >>> y[1:3] = [7, 8] + >>> y + CudaNtuples(3, 'uint8').element([5, 7, 8]) + >>> y[:] = np.array([0, 0, 0]) + >>> y + CudaNtuples(3, 'uint8').element([0, 0, 0]) + + Scalar assignment + + >>> y[:] = 5 + >>> y + CudaNtuples(3, 'uint8').element([5, 5, 5]) + """ + if isinstance(values, CudaNtuplesVector): + self.assign(values) # use lincomb magic + else: + if isinstance(indices, slice): + # Convert value to the correct type if needed + value_array = np.asarray(values, dtype=self.space.dtype) + + if value_array.ndim == 0: + self.data.fill(values) + else: + # Size checking is performed in c++ + self.data.setslice(indices, value_array) + else: + self.data[int(indices)] = values + + @property + def ufunc(self): + """`CudaNtuplesUFuncs`, access to numpy style ufuncs. 
+ + Examples + -------- + >>> r2 = CudaFn(2) + >>> x = r2.element([1, -2]) + >>> x.ufunc.absolute() + CudaFn(2).element([1.0, 2.0]) + + These functions can also be used with broadcasting + + >>> x.ufunc.add(3) + CudaFn(2).element([4.0, 1.0]) + + and non-space elements + + >>> x.ufunc.subtract([3, 3]) + CudaFn(2).element([-2.0, -5.0]) + + There is also support for various reductions (sum, prod, min, max) + + >>> x.ufunc.sum() + -1.0 + + Also supports out parameter + + >>> y = r2.element([3, 4]) + >>> out = r2.element() + >>> result = x.ufunc.add(y, out=out) + >>> result + CudaFn(2).element([4.0, 2.0]) + >>> result is out + True + + Notes + ----- + Not all ufuncs are currently optimized, some use the default numpy + implementation. This can be improved in the future. + + See also + -------- + odl.util.ufuncs.NtuplesBaseUFuncs + Base class for ufuncs in `NtuplesBase` spaces. + """ + return CudaNtuplesUFuncs(self) + + +class CudaFn(FnBase, CudaNtuples): + + """The space `FnBase`, implemented in CUDA. + + Requires the compiled ODL extension odlpp. + """ + + def __init__(self, size, dtype='float32', **kwargs): + """Initialize a new instance. + + Parameters + ---------- + size : positive `int` + The number of dimensions of the space + dtype : `object` + The data type of the storage array. Can be provided in any + way the `numpy.dtype` function understands, most notably + as built-in type, as one of NumPy's internal datatype + objects or as string. + + Only scalar data types are allowed. + + weight : optional + Use weighted inner product, norm, and dist. The following + types are supported as ``weight``: + + `FnWeightingBase` : + Use this weighting as-is. Compatibility with this + space's elements is not checked during init. + + `float` : + Weighting by a constant + + `array-like` : + Weighting by a vector (1-dim. array, corresponds to + a diagonal matrix). Note that the array is stored in + main memory, which results in slower space functions + due to a copy during evaluation. + + `CudaFnVector` : + same as 1-dim. array-like, except that copying is + avoided if the ``dtype`` of the vector is the + same as this space's ``dtype``. + + Default: no weighting + + This option cannot be combined with ``dist``, ``norm`` + or ``inner``. + + exponent : positive `float`, optional + Exponent of the norm. For values other than 2.0, no + inner product is defined. + + This option is ignored if ``dist``, ``norm`` or + ``inner`` is given. + + Default: 2.0 + + dist : `callable`, optional + The distance function defining a metric on the space. + It must accept two `FnVector` arguments and + fulfill the following mathematical conditions for any + three vectors ``x, y, z``: + + - ``dist(x, y) >= 0`` + - ``dist(x, y) = 0`` if and only if ``x = y`` + - ``dist(x, y) = dist(y, x)`` + - ``dist(x, y) <= dist(x, z) + dist(z, y)`` + + This option cannot be combined with ``weight``, + ``norm`` or ``inner``. + + norm : `callable`, optional + The norm implementation. It must accept an + `FnVector` argument, return a `float` and satisfy the + following conditions for all vectors ``x, y`` and scalars + ``s``: + + - ``||x|| >= 0`` + - ``||x|| = 0`` if and only if ``x = 0`` + - ``||s * x|| = |s| * ||x||`` + - ``||x + y|| <= ||x|| + ||y||`` + + By default, ``norm(x)`` is calculated as ``inner(x, x)``. + + This option cannot be combined with ``weight``, + ``dist`` or ``inner``. + + inner : `callable`, optional + The inner product implementation. 
It must accept two + `FnVector` arguments, return a element from + the field of the space (real or complex number) and + satisfy the following conditions for all vectors + ``x, y, z`` and scalars ``s``: + + - `` = conj()`` + - `` = s * + `` + - `` = 0`` if and only if ``x = 0`` + + This option cannot be combined with ``weight``, + ``dist`` or ``norm``. + """ + FnBase.__init__(self, size, dtype) + CudaNtuples.__init__(self, size, dtype) + + dist = kwargs.pop('dist', None) + norm = kwargs.pop('norm', None) + inner = kwargs.pop('inner', None) + weight = kwargs.pop('weight', None) + exponent = kwargs.pop('exponent', 2.0) + + # Check validity of option combination (3 or 4 out of 4 must be None) + if sum(x is None for x in (dist, norm, inner, weight)) < 3: + raise ValueError('invalid combination of options `weight`, ' + '`dist`, `norm` and `inner`') + if weight is not None: + if isinstance(weight, WeightingBase): + self._weighting = weight + elif np.isscalar(weight): + self._weighting = CudaFnConstWeighting( + weight, exponent=exponent) + elif isinstance(weight, CudaFnVector): + self._weighting = CudaFnVectorWeighting( + weight, exponent=exponent) + else: + # Must make a CudaFnVector from the array + weight = self.element(np.asarray(weight)) + if weight.ndim == 1: + self._weighting = CudaFnVectorWeighting( + weight, exponent=exponent) + else: + raise ValueError('invalid weight argument {!r}' + ''.format(weight)) + elif dist is not None: + self._weighting = CudaFnCustomDist(dist) + elif norm is not None: + self._weighting = CudaFnCustomNorm(norm) + elif inner is not None: + # Use fast dist implementation + self._weighting = CudaFnCustomInnerProduct( + inner, dist_using_inner=True) + else: # all None -> no weighing + self._weighting = CudaFnNoWeighting(exponent) + + @property + def exponent(self): + """Exponent of the norm and distance.""" + return self.weighting.exponent + + @property + def weighting(self): + """This space's weighting scheme.""" + return self._weighting + + @property + def is_weighted(self): + """Return `True` if the weighting is not `CudaFnNoWeighting`.""" + return not isinstance(self.weighting, CudaFnNoWeighting) + + @staticmethod + def default_dtype(field=None): + """Return the default of `CudaFn` data type for a given field. + + Parameters + ---------- + field : `Field` + Set of numbers to be represented by a data type. + Currently supported: `RealNumbers`. + + Returns + ------- + dtype : `type` + Numpy data type specifier. The returned defaults are: + + ``RealNumbers()`` : , ``np.dtype('float32')`` + """ + if field is None or field == RealNumbers(): + return np.dtype('float32') + else: + raise ValueError('no default data type defined for field {}' + ''.format(field)) + + def _lincomb(self, a, x1, b, x2, out): + """Linear combination of ``x1`` and ``x2``, assigned to ``out``. + + Calculate ``z = a * x + b * y`` using optimized CUDA routines. + + Parameters + ---------- + a, b : `LinearSpace.field` `element` + Scalar to multiply ``x`` and ``y`` with. + x, y : `CudaFnVector` + The summands + out : `CudaFnVector` + The Vector that the result is written to. + + Returns + ------- + `None` + + Examples + -------- + >>> r3 = CudaFn(3) + >>> x = r3.element([1, 2, 3]) + >>> y = r3.element([4, 5, 6]) + >>> out = r3.element() + >>> r3.lincomb(2, x, 3, y, out) # out is returned + CudaFn(3).element([14.0, 19.0, 24.0]) + >>> out + CudaFn(3).element([14.0, 19.0, 24.0]) + """ + out.data.lincomb(a, x1.data, b, x2.data) + + def _inner(self, x1, x2): + """Calculate the inner product of x and y. 
+ + Parameters + ---------- + x1, x2 : `CudaFnVector` + + Returns + ------- + inner: `float` or `complex` + The inner product of x and y + + + Examples + -------- + >>> uc3 = CudaFn(3, 'uint8') + >>> x = uc3.element([1, 2, 3]) + >>> y = uc3.element([3, 1, 5]) + >>> uc3.inner(x, y) + 20.0 + """ + return self.weighting.inner(x1, x2) + + def _dist(self, x1, x2): + """Calculate the distance between two vectors. + + Parameters + ---------- + x1, x2 : `CudaFnVector` + The vectors whose mutual distance is calculated + + Returns + ------- + dist : `float` + Distance between the vectors + + Examples + -------- + >>> r2 = CudaFn(2) + >>> x = r2.element([3, 8]) + >>> y = r2.element([0, 4]) + >>> r2.dist(x, y) + 5.0 + """ + return self.weighting.dist(x1, x2) + + def _norm(self, x): + """Calculate the norm of ``x``. + + This method is implemented separately from ``sqrt(inner(x,x))`` + for efficiency reasons. + + Parameters + ---------- + x : `CudaFnVector` + + Returns + ------- + norm : `float` + The norm of x + + Examples + -------- + >>> uc3 = CudaFn(3, 'uint8') + >>> x = uc3.element([2, 3, 6]) + >>> uc3.norm(x) + 7.0 + """ + return self.weighting.norm(x) + + def _multiply(self, x1, x2, out): + """The pointwise product of two vectors, assigned to ``out``. + + This is defined as: + + multiply(x, y, out) := [x[0]*y[0], x[1]*y[1], ..., x[n-1]*y[n-1]] + + Parameters + ---------- + + x1, x2 : `CudaFnVector` + Factors in product + out : `CudaFnVector` + Element to which the result is written + + Returns + ------- + `None` + + Examples + -------- + + >>> rn = CudaFn(3) + >>> x1 = rn.element([5, 3, 2]) + >>> x2 = rn.element([1, 2, 3]) + >>> out = rn.element() + >>> rn.multiply(x1, x2, out) # out is returned + CudaFn(3).element([5.0, 6.0, 6.0]) + >>> out + CudaFn(3).element([5.0, 6.0, 6.0]) + """ + out.data.multiply(x1.data, x2.data) + + def _divide(self, x1, x2, out): + """The pointwise division of two vectors, assigned to ``out``. + + This is defined as: + + multiply(z, x, y) := [x[0]/y[0], x[1]/y[1], ..., x[n-1]/y[n-1]] + + Parameters + ---------- + + x1, x2 : `CudaFnVector` + Factors in the product + out : `CudaFnVector` + Element to which the result is written + + Returns + ------- + None + + Examples + -------- + + >>> rn = CudaFn(3) + >>> x1 = rn.element([5, 3, 2]) + >>> x2 = rn.element([1, 2, 2]) + >>> out = rn.element() + >>> rn.divide(x1, x2, out) # out is returned + CudaFn(3).element([5.0, 1.5, 1.0]) + >>> out + CudaFn(3).element([5.0, 1.5, 1.0]) + """ + out.data.divide(x1.data, x2.data) + + def zero(self): + """Create a vector of zeros.""" + return self.element_type(self, self._vector_impl(self.size, 0)) + + def one(self): + """Create a vector of ones.""" + return self.element_type(self, self._vector_impl(self.size, 1)) + + def __eq__(self, other): + """s.__eq__(other) <==> s == other. + + Returns + ------- + equals : `bool` + `True` if other is an instance of this space's type + with the same ``size``, ``dtype`` and space functions, + otherwise `False`. + + Examples + -------- + >>> from numpy.linalg import norm + >>> def dist(x, y, p): + ... 
return norm(x - y, ord=p) + + >>> from functools import partial + >>> dist2 = partial(dist, p=2) + >>> r3 = CudaFn(3, dist=dist2) + >>> r3_same = CudaFn(3, dist=dist2) + >>> r3 == r3_same + True + + Different ``dist`` functions result in different spaces - the + same applies for ``norm`` and ``inner``: + + >>> dist1 = partial(dist, p=1) + >>> r3_1 = CudaFn(3, dist=dist1) + >>> r3_2 = CudaFn(3, dist=dist2) + >>> r3_1 == r3_2 + False + + Be careful with Lambdas - they result in non-identical function + objects: + + >>> r3_lambda1 = CudaFn(3, dist=lambda x, y: norm(x-y, p=1)) + >>> r3_lambda2 = CudaFn(3, dist=lambda x, y: norm(x-y, p=1)) + >>> r3_lambda1 == r3_lambda2 + False + """ + if other is self: + return True + + return (CudaNtuples.__eq__(self, other) and + self.weighting == other.weighting) + + def __repr__(self): + """s.__repr__() <==> repr(s).""" + if self.is_rn: + class_name = 'CudaFn' + elif self.is_cn: + class_name = 'CudaCn' + else: + class_name = self.__class__.__name__ + + inner_str = '{}'.format(self.size) + if self.dtype != self.default_dtype(self.field): + inner_str += ', {}'.format(dtype_repr(self.dtype)) + + weight_str = self.weighting.repr_part + if weight_str: + inner_str += ', ' + weight_str + return '{}({})'.format(class_name, inner_str) + + @property + def element_type(self): + """ `CudaFnVector` """ + return CudaFnVector + + +class CudaFnVector(FnBaseVector, CudaNtuplesVector): + + """Representation of a `CudaFn` element.""" + + def __init__(self, space, data): + """Initialize a new instance.""" + super().__init__(space, data) + + +def _weighting(weight, exponent): + """Return a weighting whose type is inferred from the arguments.""" + if np.isscalar(weight): + weighting = CudaFnConstWeighting( + weight, exponent) + elif isinstance(weight, CudaFnVector): + weighting = CudaFnVectorWeighting( + weight, exponent=exponent) + else: + weight_ = np.asarray(weight) + if weight_.dtype == object: + raise ValueError('bad weight {}'.format(weight)) + if weight_.ndim == 1: + weighting = CudaFnVectorWeighting( + weight_, exponent) + elif weight_.ndim == 2: + raise NotImplementedError('matrix weighting not implemented ' + 'for CUDA spaces') +# weighting = CudaFnMatrixWeighting( +# weight_, exponent, dist_using_inner=dist_using_inner) + else: + raise ValueError('array-like weight must have 1 or 2 dimensions, ' + 'but {} has {} dimensions' + ''.format(weight, weight_.ndim)) + return weighting + + +def cu_weighted_inner(weight): + """Weighted inner product on `CudaFn` spaces as free function. + + Parameters + ---------- + weight : scalar, `array-like` or `CudaFnVector` + Weight of the inner product. A scalar is interpreted as a + constant weight and a 1-dim. array or a `CudaFnVector` + as a weighting vector. + + Returns + ------- + inner : `callable` + Inner product function with given weight. Constant weightings + are applicable to spaces of any size, for arrays the sizes + of the weighting and the space must match. + + See also + -------- + CudaFnConstWeighting, CudaFnVectorWeighting + """ + return _weighting(weight, exponent=2.0).inner + + +def cu_weighted_norm(weight, exponent=2.0): + """Weighted norm on `CudaFn` spaces as free function. + + Parameters + ---------- + weight : scalar, `array-like` or `CudaFnVector` + Weight of the inner product. A scalar is interpreted as a + constant weight and a 1-dim. array or a `CudaFnVector` + as a weighting vector. + exponent : positive `float` + Exponent of the norm. If ``weight`` is a sparse matrix, only + 1.0, 2.0 and ``inf`` are allowed. 
+ + Returns + ------- + norm : `callable` + Norm function with given weight. Constant weightings + are applicable to spaces of any size, for arrays the sizes + of the weighting and the space must match. + + See also + -------- + CudaFnConstWeighting, CudaFnVectorWeighting + """ + return _weighting(weight, exponent=exponent).norm + + +def cu_weighted_dist(weight, exponent=2.0): + """Weighted distance on `CudaFn` spaces as free function. + + Parameters + ---------- + weight : scalar, `array-like` or `CudaFnVector` + Weight of the inner product. A scalar is interpreted as a + constant weight and a 1-dim. array or a `CudaFnVector` + as a weighting vector. + exponent : positive `float` + Exponent of the distance + + Returns + ------- + dist : `callable` + Distance function with given weight. Constant weightings + are applicable to spaces of any size, for arrays the sizes + of the weighting and the space must match. + + See also + -------- + CudaFnConstWeighting, CudaFnVectorWeighting + """ + return _weighting(weight, exponent=exponent).dist + + +def _dist_default(x1, x2): + """Default Euclidean distance implementation.""" + return x1.data.dist(x2.data) + + +def _pdist_default(x1, x2, p): + """Default p-distance implementation.""" + if p == float('inf'): + raise NotImplementedError('inf-norm not implemented') + return x1.data.dist_power(x2.data, p) + + +def _pdist_diagweight(x1, x2, p, w): + """Diagonally weighted p-distance implementation.""" + return x1.data.dist_weight(x2.data, p, w.data) + + +def _norm_default(x): + """Default Euclidean norm implementation.""" + return x.data.norm() + + +def _pnorm_default(x, p): + """Default p-norm implementation.""" + if p == float('inf'): + raise NotImplementedError('inf-norm not implemented') + return x.data.norm_power(p) + + +def _pnorm_diagweight(x, p, w): + """Diagonally weighted p-norm implementation.""" + if p == float('inf'): + raise NotImplementedError('inf-norm not implemented') + return x.data.norm_weight(p, w.data) + + +def _inner_default(x1, x2): + """Default Euclidean inner product implementation.""" + return x1.data.inner(x2.data) + + +def _inner_diagweight(x1, x2, w): + return x1.data.inner_weight(x2.data, w.data) + + +class CudaFnVectorWeighting(VectorWeightingBase): + + """Vector weighting for `CudaFn`. + + For exponent 2.0, a new weighted inner product with vector ``w`` + is defined as:: + + _w := = b^H (w * a) + + with ``b^H`` standing for transposed complex conjugate and + ``w * a`` for element-wise multiplication. + + For other exponents, only norm and dist are defined. In the case of + exponent ``inf``, the weighted norm is + + ||a||_{w, inf} := ||w * a||_inf + + otherwise it is:: + + ||a||_{w, p} := ||w^{1/p} * a|| + + Note that this definition does **not** fulfill the limit property + in ``p``, i.e.:: + + ||x||_{w, p} --/-> ||x||_{w, inf} for p --> inf + + unless ``w = (1,...,1)``. + + The vector may only have positive entries, otherwise it does not + define an inner product or norm, respectively. This is not checked + during initialization. + """ + + def __init__(self, vector, exponent=2.0): + """Initialize a new instance. + + Parameters + ---------- + vector : `CudaFnVector` + Weighting vector of the inner product, norm and distance + exponent : positive `float` + Exponent of the norm. For values other than 2.0, the inner + product is not defined. 
+ """ + if not isinstance(vector, CudaFnVector): + raise TypeError('vector {!r} is not a CudaFnVector instance' + ''.format(vector)) + + super().__init__(vector, impl='cuda', exponent=exponent, + dist_using_inner=False) + + def inner(self, x1, x2): + """Calculate the vector weighted inner product of two vectors. + + Parameters + ---------- + x1, x2 : `CudaFnVector` + Vectors whose inner product is calculated + + Returns + ------- + inner : `float` or `complex` + The inner product of the two provided vectors + """ + if self.exponent != 2.0: + raise NotImplementedError('No inner product defined for ' + 'exponent != 2 (got {})' + ''.format(self.exponent)) + else: + return _inner_diagweight(x1, x2, self.vector) + + def norm(self, x): + """Calculate the vector-weighted norm of a vector. + + Parameters + ---------- + x : `CudaFnVector` + Vector whose norm is calculated + + Returns + ------- + norm : `float` + The norm of the provided vector + """ + if self.exponent == float('inf'): + raise NotImplementedError('inf norm not implemented yet') + else: + return _pnorm_diagweight(x, self.exponent, self.vector) + + def dist(self, x1, x2): + """Calculate the vector-weighted distance between two vectors. + + Parameters + ---------- + x1, x2 : `CudaFnVector` + Vectors whose mutual distance is calculated + + Returns + ------- + dist : `float` + The distance between the vectors + """ + if self.exponent == float('inf'): + raise NotImplementedError('inf norm not implemented yet') + else: + return _pdist_diagweight(x1, x2, self.exponent, self.vector) + + +class CudaFnConstWeighting(ConstWeightingBase): + + """Weighting of `CudaFn` by a constant. + + For exponent 2.0, a new weighted inner product with constant + ``c`` is defined as:: + + _c = c * = c * b^H a + + with ``b^H`` standing for transposed complex conjugate. + + For other exponents, only norm and dist are defined. In the case of + exponent ``inf``, the weighted norm is defined as:: + + ||a||_{c, inf} := c ||a||_inf + + otherwise it is:: + + ||a||_{c, p} := c^{1/p} ||a||_p + + Note that this definition does **not** fulfill the limit property + in ``p``, i.e.:: + + ||a||_{c,p} --/-> ||a||_{c,inf} for p --> inf + + unless ``c = 1``. + + The constant must be positive, otherwise it does not define an + inner product or norm, respectively. + """ + + def __init__(self, constant, exponent=2.0): + """Initialize a new instance. + + Parameters + ---------- + constant : positive finite `float` + Weighting constant of the inner product. + exponent : positive `float` + Exponent of the norm. For values other than 2.0, the inner + product is not defined. + """ + super().__init__(constant, impl='cuda', exponent=exponent, + dist_using_inner=False) + + def inner(self, x1, x2): + """Calculate the constant-weighted inner product of two vectors. + + Parameters + ---------- + x1, x2 : `CudaFnVector` + Vectors whose inner product is calculated + + Returns + ------- + inner : `float` or `complex` + The inner product of the two vectors + """ + if self.exponent != 2.0: + raise NotImplementedError('no inner product defined for ' + 'exponent != 2 (got {})' + ''.format(self.exponent)) + else: + return self.const * _inner_default(x1, x2) + + def norm(self, x): + """Calculate the constant-weighted norm of a vector. 
+ + Parameters + ---------- + x1 : `CudaFnVector` + Vector whose norm is calculated + + Returns + ------- + norm : `float` + The norm of the vector + """ + if self.exponent == float('inf'): + raise NotImplementedError + # Example impl + # return self.const * float(_pnorm_default(x, self.exponent)) + else: + return (self.const ** (1 / self.exponent) * + float(_pnorm_default(x, self.exponent))) + + def dist(self, x1, x2): + """Calculate the constant-weighted distance between two vectors. + + Parameters + ---------- + x1, x2 : `CudaFnVector` + Vectors whose mutual distance is calculated + + Returns + ------- + dist : `float` + The distance between the vectors + """ + if self.exponent == float('inf'): + raise NotImplementedError + else: + return (self.const ** (1 / self.exponent) * + _pdist_default(x1, x2, self.exponent)) + + +class CudaFnNoWeighting(NoWeightingBase, CudaFnConstWeighting): + + """Weighting of `CudaFn` with constant 1. + + For exponent 2.0, the unweighted inner product is defined as:: + + := b^H a + + with ``b^H`` standing for transposed complex conjugate. + + For other exponents, only norm and dist are defined. + """ + + # Implement singleton pattern for efficiency in the default case + _instance = None + + def __new__(cls, *args, **kwargs): + """Implement singleton pattern if ``exponent==2.0``.""" + if len(args) == 0: + exponent = kwargs.pop('exponent', 2.0) + else: + exponent = args[0] + args = args[1:] + + if exponent == 2.0: + if not cls._instance: + cls._instance = super().__new__(cls, *args, **kwargs) + return cls._instance + else: + return super().__new__(cls, *args, **kwargs) + + def __init__(self, exponent=2.0): + """Initialize a new instance. + + Parameters + ---------- + exponent : positive `float` + Exponent of the norm. For values other than 2.0, the inner + product is not defined. + """ + super().__init__(exponent=exponent, impl='cuda', + dist_using_inner=False) + + +class CudaFnCustomInnerProduct(CustomInnerProductBase): + + """Class for handling a user-specified inner product on `CudaFn`.""" + + def __init__(self, inner, dist_using_inner=True): + """Initialize a new instance. + + Parameters + ---------- + inner : `callable` + The inner product implementation. It must accept two + `FnVector` arguments, return an element from their space's + field (real or complex number) and satisfy the following + conditions for all vectors ``x, y, z`` and scalars ``s``: + + - `` = conj()`` + - `` = s * + `` + - `` = 0`` if and only if ``x = 0`` + + dist_using_inner : `bool`, optional + Calculate ``dist`` using the following formula:: + + ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * Re + + This avoids the creation of new arrays and is thus faster + for large arrays. On the downside, it will not evaluate to + exactly zero for equal (but not identical) ``x`` and ``y``. + """ + super().__init__(inner, impl='cuda', + dist_using_inner=dist_using_inner) + + +class CudaFnCustomNorm(CustomNormBase): + + """Class for handling a user-specified norm in `CudaFn`. + + Note that this removes ``inner``. + """ + + def __init__(self, norm): + """Initialize a new instance. + + Parameters + ---------- + norm : `callable` + The norm implementation. 
It must accept a `CudaFnVector` + argument, return a `float` and satisfy the following + conditions for all vectors ``x, y`` and scalars ``s``: + + - ``||x|| >= 0`` + - ``||x|| = 0`` if and only if ``x = 0`` + - ``||s * x|| = |s| * ||x||`` + - ``||x + y|| <= ||x|| + ||y||`` + """ + super().__init__(norm, impl='cuda') + + +class CudaFnCustomDist(CustomDistBase): + + """Class for handling a user-specified distance in `CudaFn`. + + Note that this removes ``inner`` and ``norm``. + """ + + def __init__(self, dist): + """Initialize a new instance. + + Parameters + ---------- + dist : `callable` + The distance function defining a metric on `Fn`. It must + accept two `FnVector` arguments, return a `float` and and + fulfill the following mathematical conditions for any three + vectors ``x, y, z``: + + - ``dist(x, y) >= 0`` + - ``dist(x, y) = 0`` if and only if ``x = y`` + - ``dist(x, y) = dist(y, x)`` + - ``dist(x, y) <= dist(x, z) + dist(z, y)`` + """ + super().__init__(dist, impl='cuda') + + +if __name__ == '__main__': + # pylint: disable=wrong-import-position + from odl.util.testutils import run_doctests + run_doctests() diff --git a/odlpp/cuda/CMakeLists.txt b/odlcuda/cuda/CMakeLists.txt similarity index 97% rename from odlpp/cuda/CMakeLists.txt rename to odlcuda/cuda/CMakeLists.txt index 2c40254..fc41bd7 100644 --- a/odlpp/cuda/CMakeLists.txt +++ b/odlcuda/cuda/CMakeLists.txt @@ -1,7 +1,7 @@ include_directories ("${PROJECT_SOURCE_DIR}") if(CUDA_ENABLED) - set(LIB_NAME "odlpp_cuda") + set(LIB_NAME "odlcuda_") cuda_include_directories ("${PROJECT_SOURCE_DIR}") CUDA_ADD_LIBRARY(${LIB_NAME} SHARED "cuda.cpp" "cuda.cu" diff --git a/odlpp/cuda/CudaVector.h b/odlcuda/cuda/CudaVector.h similarity index 97% rename from odlpp/cuda/CudaVector.h rename to odlcuda/cuda/CudaVector.h index 74d80c6..f2b1c96 100644 --- a/odlpp/cuda/CudaVector.h +++ b/odlcuda/cuda/CudaVector.h @@ -15,8 +15,8 @@ #include #include -#include -#include +#include +#include using namespace boost::python; diff --git a/odlpp/cuda/CudaVectorImpl.cu b/odlcuda/cuda/CudaVectorImpl.cu similarity index 99% rename from odlpp/cuda/CudaVectorImpl.cu rename to odlcuda/cuda/CudaVectorImpl.cu index 5e812c7..fb4bb1c 100644 --- a/odlpp/cuda/CudaVectorImpl.cu +++ b/odlcuda/cuda/CudaVectorImpl.cu @@ -19,9 +19,9 @@ #include // ODL -#include -#include -#include +#include +#include +#include // Utils #include diff --git a/odlpp/cuda/CudaVectorImpl.h b/odlcuda/cuda/CudaVectorImpl.h similarity index 97% rename from odlpp/cuda/CudaVectorImpl.h rename to odlcuda/cuda/CudaVectorImpl.h index 9f3ecd1..d68972f 100644 --- a/odlpp/cuda/CudaVectorImpl.h +++ b/odlcuda/cuda/CudaVectorImpl.h @@ -4,8 +4,8 @@ #include #include -#include -#include +#include +#include // The scalar type used for multiplication template diff --git a/odlpp/cuda/DeviceVector.h b/odlcuda/cuda/DeviceVector.h similarity index 100% rename from odlpp/cuda/DeviceVector.h rename to odlcuda/cuda/DeviceVector.h diff --git a/odlpp/cuda/DeviceVectorImpl.h b/odlcuda/cuda/DeviceVectorImpl.h similarity index 98% rename from odlpp/cuda/DeviceVectorImpl.h rename to odlcuda/cuda/DeviceVectorImpl.h index 518a1e8..65f23d3 100644 --- a/odlpp/cuda/DeviceVectorImpl.h +++ b/odlcuda/cuda/DeviceVectorImpl.h @@ -8,7 +8,7 @@ #include // ODL -#include +#include // Utils #include diff --git a/odlpp/cuda/Reduction.cu b/odlcuda/cuda/Reduction.cu similarity index 88% rename from odlpp/cuda/Reduction.cu rename to odlcuda/cuda/Reduction.cu index 242b16a..131b5fe 100644 --- a/odlpp/cuda/Reduction.cu +++ 
b/odlcuda/cuda/Reduction.cu @@ -12,9 +12,9 @@ #include // ODL -#include -#include -#include +#include +#include +#include // Utils #include diff --git a/odlpp/cuda/Reduction.h b/odlcuda/cuda/Reduction.h similarity index 88% rename from odlpp/cuda/Reduction.h rename to odlcuda/cuda/Reduction.h index 18cdfc4..eede244 100644 --- a/odlpp/cuda/Reduction.h +++ b/odlcuda/cuda/Reduction.h @@ -4,8 +4,8 @@ #include #include -#include -#include +#include +#include // clang-format off diff --git a/odlpp/cuda/SliceHelper.h b/odlcuda/cuda/SliceHelper.h similarity index 100% rename from odlpp/cuda/SliceHelper.h rename to odlcuda/cuda/SliceHelper.h diff --git a/odlpp/cuda/TypeMacro.h b/odlcuda/cuda/TypeMacro.h similarity index 100% rename from odlpp/cuda/TypeMacro.h rename to odlcuda/cuda/TypeMacro.h diff --git a/odlpp/cuda/UFunc.cu b/odlcuda/cuda/UFunc.cu similarity index 94% rename from odlpp/cuda/UFunc.cu rename to odlcuda/cuda/UFunc.cu index 1659598..b5f21f0 100644 --- a/odlpp/cuda/UFunc.cu +++ b/odlcuda/cuda/UFunc.cu @@ -11,9 +11,9 @@ #include // ODL -#include -#include -#include +#include +#include +#include // Utils #include diff --git a/odlpp/cuda/UFunc.h b/odlcuda/cuda/UFunc.h similarity index 90% rename from odlpp/cuda/UFunc.h rename to odlcuda/cuda/UFunc.h index 6a77223..9d5d4d5 100644 --- a/odlpp/cuda/UFunc.h +++ b/odlcuda/cuda/UFunc.h @@ -4,8 +4,8 @@ #include #include -#include -#include +#include +#include // clang-format off diff --git a/odlpp/cuda/cuda.cpp b/odlcuda/cuda/cuda.cpp similarity index 97% rename from odlpp/cuda/cuda.cpp rename to odlcuda/cuda/cuda.cpp index 94793ee..15f5555 100644 --- a/odlpp/cuda/cuda.cpp +++ b/odlcuda/cuda/cuda.cpp @@ -14,11 +14,11 @@ #include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include using namespace boost::python; @@ -174,7 +174,7 @@ void instantiateCudaVector(const std::string& name) { } // Expose classes and methods to Python -BOOST_PYTHON_MODULE(odlpp_cuda) { +BOOST_PYTHON_MODULE(odlcuda_) { auto result = _import_array(); // Import numpy if (result != 0) { PyErr_Print(); diff --git a/odlpp/cuda/cuda.cu b/odlcuda/cuda/cuda.cu similarity index 98% rename from odlpp/cuda/cuda.cu rename to odlcuda/cuda/cuda.cu index d7bde08..dda95b8 100644 --- a/odlpp/cuda/cuda.cu +++ b/odlcuda/cuda/cuda.cu @@ -16,9 +16,9 @@ #include // ODL -#include -#include -#include +#include +#include +#include // Utils #include diff --git a/odlpp/cuda/dummy.cpp b/odlcuda/cuda/dummy.cpp similarity index 100% rename from odlpp/cuda/dummy.cpp rename to odlcuda/cuda/dummy.cpp diff --git a/odlcuda/odl_plugin.py b/odlcuda/odl_plugin.py new file mode 100644 index 0000000..ba7c4db --- /dev/null +++ b/odlcuda/odl_plugin.py @@ -0,0 +1,9 @@ +"""Exposes the odlcuda files to ODL as a plugin.""" + +from . 
import cu_ntuples + +def ntuples_impls(): + return {'cuda': cu_ntuples.CudaNtuples} + +def fn_impls(): + return {'cuda': cu_ntuples.CudaFn} diff --git a/odlcuda/setup.py b/odlcuda/setup.py new file mode 100644 index 0000000..21bcd7d --- /dev/null +++ b/odlcuda/setup.py @@ -0,0 +1,20 @@ +"""Setup script for odlcuda.""" + +from future import standard_library +standard_library.install_aliases() + +from setuptools import setup + + +setup(name='odlcuda', + version='0.3.0', + author='Jonas Adler', + author_email='jonasadl@kth.se', + url='https://github.com/odlgroup/odlcuda', + description='C++ backend for odl', + license='GPLv3', + install_requires=['odl >= 0.3.0'], + packages=['odlcuda'], + package_dir={'odlcuda': '.'}, + package_data={'' : ['*.pyd', '*.so']}, + entry_points={'odl.space': ['odlcuda = odlcuda.odl_plugin']}) diff --git a/odlcuda/ufuncs.py b/odlcuda/ufuncs.py new file mode 100644 index 0000000..dd3f6e0 --- /dev/null +++ b/odlcuda/ufuncs.py @@ -0,0 +1,91 @@ +# Copyright 2014-2016 The ODL development group +# +# This file is part of ODL. +# +# ODL is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ODL is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with ODL. If not, see . + +"""UFuncs for ODL vectors. + +These functions are internal and should only be used as methods on +`NtuplesBaseVector` type spaces. + +See `numpy.ufuncs +`_ +for more information. + +Notes +----- +The default implementation of these methods make heavy use of the +``NtuplesBaseVector.__array__`` to extract a `numpy.ndarray` from the vector, +and then apply a ufunc to it. Afterwards, ``NtuplesBaseVector.__array_wrap__`` +is used to re-wrap the data into the appropriate space. +""" + +# Imports for common Python 2/3 codebase +from __future__ import print_function, division, absolute_import +from future import standard_library +standard_library.install_aliases() + +import re +import odl + + +__all__ = ('CudaNtuplesUFuncs',) + + +# Optimizations for CUDA +def _make_nullary_fun(name): + def fun(self): + return getattr(self.vector.data, name)() + + fun.__doc__ = getattr(odl.util.ufuncs.NtuplesBaseUFuncs, name).__doc__ + fun.__name__ = name + return fun + + +def _make_unary_fun(name): + def fun(self, out=None): + if out is None: + out = self.vector.space.element() + getattr(self.vector.data, name)(out.data) + return out + + fun.__doc__ = getattr(odl.util.ufuncs.NtuplesBaseUFuncs, name).__doc__ + fun.__name__ = name + return fun + + +class CudaNtuplesUFuncs(odl.util.ufuncs.NtuplesBaseUFuncs): + + """UFuncs for `CudaNtuplesVector` objects. + + Internal object, should not be created except in `CudaNtuplesVector`. 
+ """ + + # Ufuncs + sin = _make_unary_fun('sin') + cos = _make_unary_fun('cos') + arcsin = _make_unary_fun('arcsin') + arccos = _make_unary_fun('arccos') + log = _make_unary_fun('log') + exp = _make_unary_fun('exp') + absolute = _make_unary_fun('absolute') + sign = _make_unary_fun('sign') + sqrt = _make_unary_fun('sqrt') + + # Reductions + sum = _make_nullary_fun('sum') + prod = _make_nullary_fun('prod') + min = _make_nullary_fun('min') + max = _make_nullary_fun('max') diff --git a/odlpp/setup.py b/odlpp/setup.py deleted file mode 100644 index dff3494..0000000 --- a/odlpp/setup.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -""" - -""" - -from future import standard_library -standard_library.install_aliases() - -from distutils.core import setup - - -setup(name='odlpp', - version='0.2.0', - author='Jonas Adler', - author_email='jonasadl@kth.se', - url='https://github.com/odlgroup/odlpp', - description='C++ backend for odl', - license='GPLv3', - packages=['odlpp'], - package_dir={'odlpp': '.'}, - package_data={'odlpp': ['*.*']}) diff --git a/test/cu_ntuples_test.py b/test/cu_ntuples_test.py new file mode 100644 index 0000000..88443e7 --- /dev/null +++ b/test/cu_ntuples_test.py @@ -0,0 +1,1112 @@ +# Copyright 2014-2016 The ODL development group +# +# This file is part of ODL. +# +# ODL is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ODL is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with ODL. If not, see . 
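The `setup.py` added above registers the package under the `odl.space` entry point, and `odl_plugin.py` exposes the `ntuples_impls` and `fn_impls` dictionaries that ODL reads from it. A rough sketch of resolving that entry point by hand, for example to debug an installation, using the standard `pkg_resources` API (how ODL itself consumes the plugin is not shown here):

    import pkg_resources

    # Enumerate plugins registered under the 'odl.space' group and load the
    # implementation dictionaries each one provides.
    for entry in pkg_resources.iter_entry_points('odl.space'):
        plugin = entry.load()
        print(entry.name, plugin.ntuples_impls(), plugin.fn_impls())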
+ + +# Imports for common Python 2/3 codebase +from __future__ import print_function, division, absolute_import +from future import standard_library +standard_library.install_aliases() +from builtins import range + +# External module imports +import pytest +import numpy as np +from numpy import float64 + +# ODL imports +import odl +from odlcuda.cu_ntuples import ( + CudaNtuples, CudaFn, CudaFnVector, + CudaFnNoWeighting, CudaFnConstWeighting, CudaFnVectorWeighting, + CudaFnCustomInnerProduct, CudaFnCustomNorm, CudaFnCustomDist, + cu_weighted_inner, cu_weighted_norm, cu_weighted_dist, CUDA_DTYPES) + +from odl.util.testutils import (all_equal, all_almost_equal, almost_equal, + example_vectors, example_element) + + +# Helper to generate data +def _pos_vector(fn): + """Create an vector with positive real entries as weight in `fn`.""" + return np.abs(example_element(fn)) + 0.1 + + +# Pytest fixtures + + +spc_params = ['100 float32'] +spc_ids = [' size={} dtype={} ' + ''.format(*p.split()) for p in spc_params] + + +@pytest.fixture(scope="module", ids=spc_ids, params=spc_params) +def fn(request): + size, dtype = request.param.split() + return CudaFn(int(size), dtype=dtype) + + +# Simply modify exp_params to modify the fixture +exp_params = [2.0, 1.0, float('inf'), 0.5, 1.5, 3.0] +exp_ids = [' p = {} '.format(p) for p in exp_params] +exp_fixture = pytest.fixture(scope="module", ids=exp_ids, params=exp_params) + + +@exp_fixture +def exponent(request): + return request.param + + +# Simply modify dtype_params to modify the fixture +dtype_params = CudaFn.available_dtypes() +dtype_ids = [' dtype = {} '.format(t) for t in dtype_params] +dtype_fixture = pytest.fixture(scope="module", ids=dtype_ids, + params=dtype_params) + + +@dtype_fixture +def dtype(request): + return request.param + + +ufunc_params = [ufunc for ufunc in odl.util.ufuncs.UFUNCS] +ufunc_ids = [' ufunc={} '.format(p[0]) for p in ufunc_params] + + +@pytest.fixture(scope="module", ids=ufunc_ids, params=ufunc_params) +def ufunc(request): + return request.param + + +reduction_params = [reduction for reduction in odl.util.ufuncs.REDUCTIONS] +reduction_ids = [' reduction={} '.format(p[0]) for p in reduction_params] + + +@pytest.fixture(scope="module", ids=reduction_ids, params=reduction_params) +def reduction(request): + return request.param + + +# --- CUDA space tests --- # + + +def test_init_cudantuples(dtype): + # verify that the code runs + CudaNtuples(3, dtype=dtype).element() + CudaFn(3, dtype=dtype).element() + + +def test_init_exponent(exponent, dtype): + CudaFn(3, dtype=dtype, exponent=exponent) + + +def test_init_cudantuples_bad_dtype(): + with pytest.raises(TypeError): + CudaNtuples(3, dtype=np.ndarray) + with pytest.raises(TypeError): + CudaNtuples(3, dtype=str) + with pytest.raises(TypeError): + CudaNtuples(3, dtype=np.matrix) + + +def test_init_weighting(exponent): + const = 1.5 + weight_vec = _pos_vector(CudaFn(3)) + weight_elem = CudaFn(3, dtype='float32').element(weight_vec) + + f3_none = CudaFn(3, dtype='float32', exponent=exponent) + f3_const = CudaFn(3, dtype='float32', weight=const, exponent=exponent) + f3_vec = CudaFn(3, dtype='float32', weight=weight_vec, exponent=exponent) + f3_elem = CudaFn(3, dtype='float32', weight=weight_elem, exponent=exponent) + + weighting_none = CudaFnNoWeighting(exponent=exponent) + weighting_const = CudaFnConstWeighting(const, exponent=exponent) + weighting_vec = CudaFnVectorWeighting(weight_vec, exponent=exponent) + weighting_elem = CudaFnVectorWeighting(weight_elem, exponent=exponent) + + 
assert f3_none.weighting == weighting_none + assert f3_const.weighting == weighting_const + assert f3_vec.weighting == weighting_vec + assert f3_elem.weighting == weighting_elem + + +def test_element(fn): + x = fn.element() + assert x in fn + + y = fn.element(inp=[0] * fn.size) + assert y in fn + + z = fn.element(data_ptr=y.data_ptr) + assert z in fn + + # Rewrap + z2 = fn.element(z) + assert z2 in fn + + w = fn.element(inp=np.zeros(fn.size, fn.dtype)) + assert w in fn + + with pytest.raises(ValueError): + fn.element(inp=[0] * fn.size, data_ptr=y.data_ptr) + + +def test_vector_cuda(): + + # Rn + inp = [1.0, 2.0, 3.0] + + x = odl.vector(inp, dtype='float32', impl='cuda') + assert isinstance(x, CudaFnVector) + assert x.dtype == np.dtype('float32') + assert all_equal(x, inp) + + x = odl.vector([1.0, 2.0, float('inf')], dtype='float32', impl='cuda') + assert x.dtype == np.dtype('float32') + assert isinstance(x, CudaFnVector) + + x = odl.vector([1.0, 2.0, float('nan')], dtype='float32', impl='cuda') + assert x.dtype == np.dtype('float32') + assert isinstance(x, CudaFnVector) + + x = odl.vector([1, 2, 3], dtype='float32', impl='cuda') + assert x.dtype == np.dtype('float32') + assert isinstance(x, CudaFnVector) + + +def test_zero(fn): + assert all_almost_equal(fn.zero(), [0] * fn.size) + + +def test_one(fn): + assert all_almost_equal(fn.one(), [1] * fn.size) + + +def test_list_init(fn): + x_list = list(range(fn.size)) + x = fn.element(x_list) + assert all_almost_equal(x, x_list) + + +def test_ndarray_init(): + r3 = CudaFn(3) + + x0 = np.array([1., 2., 3.]) + x = r3.element(x0) + assert all_equal(x, x0) + + x0 = np.array([1, 2, 3], dtype=float64) + x = r3.element(x0) + assert all_equal(x, x0) + + x0 = np.array([1, 2, 3], dtype=int) + x = r3.element(x0) + assert all_equal(x, x0) + + +def test_getitem(): + r3 = CudaFn(3) + y = [1, 2, 3] + x = r3.element(y) + + for index in [0, 1, 2, -1, -2, -3]: + assert x[index] == y[index] + + +def test_iterator(): + r3 = CudaFn(3) + y = [1, 2, 3] + x = r3.element(y) + + assert all_equal([a for a in x], [b for b in y]) + + +def test_getitem_index_error(): + r3 = CudaFn(3) + x = r3.element([1, 2, 3]) + + with pytest.raises(IndexError): + x[-4] + + with pytest.raises(IndexError): + x[3] + + +def test_setitem(): + r3 = CudaFn(3) + x = r3.element([42, 42, 42]) + + for index in [0, 1, 2, -1, -2, -3]: + x[index] = index + assert x[index] == index + + +def test_setitem_index_error(): + r3 = CudaFn(3) + x = r3.element([1, 2, 3]) + + with pytest.raises(IndexError): + x[-4] = 0 + + with pytest.raises(IndexError): + x[3] = 0 + + +def _test_getslice(slice): + # Validate get against python list behaviour + r6 = CudaFn(6) + y = [0, 1, 2, 3, 4, 5] + x = r6.element(y) + + assert all_equal(x[slice], y[slice]) + + +def test_getslice(): + # Tests getting all combinations of slices + steps = [None, -2, -1, 1, 2] + starts = [None, -1, -3, 0, 2, 5] + ends = [None, -1, -3, 0, 2, 5] + + for start in starts: + for end in ends: + for step in steps: + _test_getslice(slice(start, end, step)) + + +def test_slice_of_slice(): + # Verify that creating slices from slices works as expected + r10 = CudaFn(10) + xh = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + xd = r10.element(xh) + + yh = xh[1:8:2] + yd = xd[1:8:2] + + assert all_equal(yh, yd) + + zh = yh[1::2] + zd = yd[1::2] + + assert all_equal(zh, zd) + + +def test_slice_is_view(): + # Verify that modifications of a view modify the original data + r10 = CudaFn(10) + xh = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + xd = r10.element(xh) + + yh = xh[1:8:2] + 
yh[:] = [0, 0, 0, 0] + + yd = xd[1:8:2] + yd[:] = [0, 0, 0, 0] + + assert all_equal(xh, xd) + assert all_equal(yh, yd) + + +def test_getslice_index_error(): + r3 = CudaFn(3) + xd = r3.element([1, 2, 3]) + + # Bad slice + with pytest.raises(IndexError): + xd[10:13] + + +def _test_setslice(slice): + # Validate set against python list behaviour + r6 = CudaFn(6) + z = [7, 8, 9, 10, 11, 10] + y = [0, 1, 2, 3, 4, 5] + x = r6.element(y) + + x[slice] = z[slice] + y[slice] = z[slice] + assert all_equal(x, y) + + +def test_setslice(): + # Tests a range of combination of slices + steps = [None, -2, -1, 1, 2] + starts = [None, -1, -3, 0, 2, 5] + ends = [None, -1, -3, 0, 2, 5] + + for start in starts: + for end in ends: + for step in steps: + _test_setslice(slice(start, end, step)) + + +def test_setslice_index_error(): + r3 = CudaFn(3) + xd = r3.element([1, 2, 3]) + + # Bad slice + with pytest.raises(IndexError): + xd[10:13] = [1, 2, 3] + + # Bad size of rhs + with pytest.raises(IndexError): + xd[:] = [] + + with pytest.raises(IndexError): + xd[:] = [1, 2] + + with pytest.raises(IndexError): + xd[:] = [1, 2, 3, 4] + + +def test_inner(): + r3 = CudaFn(3) + x = r3.element([1, 2, 3]) + y = r3.element([5, 3, 9]) + + correct_inner = 1 * 5 + 2 * 3 + 3 * 9 + + # Space function + assert almost_equal(r3.inner(x, y), correct_inner) + + # Exponent != 2 -> no inner product + r3 = CudaFn(3, exponent=1) + x = r3.element([1, 2, 3]) + y = r3.element([5, 3, 9]) + + with pytest.raises(NotImplementedError): + r3.inner(x, y) + with pytest.raises(NotImplementedError): + x.inner(y) + + +def test_norm(exponent): + r3 = CudaFn(3, exponent=exponent) + xarr, x = example_vectors(r3) + + correct_norm = np.linalg.norm(xarr, ord=exponent) + + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + r3.norm(x) + x.norm() + else: + assert almost_equal(r3.norm(x), correct_norm) + assert almost_equal(x.norm(), correct_norm) + + +def test_dist(exponent): + r3 = CudaFn(3, exponent=exponent) + [xarr, yarr], [x, y] = example_vectors(r3, n=2) + + correct_dist = np.linalg.norm(xarr - yarr, ord=exponent) + + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + r3.dist(x, y) + with pytest.raises(NotImplementedError): + x.dist(y) + else: + assert almost_equal(r3.dist(x, y), correct_dist) + assert almost_equal(x.dist(y), correct_dist) + + +def test_astype(): + # Complex not implemented + rn = CudaFn(3, weight=1.5) + assert rn.astype('float32') == rn + + with pytest.raises(TypeError): + rn.astype(complex) + + +def _test_lincomb(fn, a, b): + # Validates lincomb against the result on host with randomized + # data and given a,b + + # Unaliased arguments + [x_arr, y_arr, z_arr], [x, y, z] = example_vectors(fn, 3) + + z_arr[:] = a * x_arr + b * y_arr + fn.lincomb(a, x, b, y, out=z) + assert all_almost_equal([x, y, z], + [x_arr, y_arr, z_arr]) + + # First argument aliased with output + [x_arr, y_arr, z_arr], [x, y, z] = example_vectors(fn, 3) + + z_arr[:] = a * z_arr + b * y_arr + fn.lincomb(a, z, b, y, out=z) + assert all_almost_equal([x, y, z], + [x_arr, y_arr, z_arr]) + + # Second argument aliased with output + [x_arr, y_arr, z_arr], [x, y, z] = example_vectors(fn, 3) + + z_arr[:] = a * x_arr + b * z_arr + fn.lincomb(a, x, b, z, out=z) + assert all_almost_equal([x, y, z], + [x_arr, y_arr, z_arr]) + + # Both arguments aliased with each other + [x_arr, y_arr, z_arr], [x, y, z] = example_vectors(fn, 3) + + z_arr[:] = a * x_arr + b * x_arr + 
fn.lincomb(a, x, b, x, out=z) + assert all_almost_equal([x, y, z], + [x_arr, y_arr, z_arr]) + + # All aliased + [x_arr, y_arr, z_arr], [x, y, z] = example_vectors(fn, 3) + + z_arr[:] = a * z_arr + b * z_arr + fn.lincomb(a, z, b, z, out=z) + assert all_almost_equal([x, y, z], + [x_arr, y_arr, z_arr]) + + +def test_lincomb(fn): + scalar_values = [0, 1, -1, 3.41] + for a in scalar_values: + for b in scalar_values: + _test_lincomb(fn, a, b) + + +def _test_member_lincomb(spc, a): + # Validates vector member lincomb against the result on host + + # Generate vectors + [x_host, y_host], [x_device, y_device] = example_vectors(spc, 2) + + # Host side calculation + y_host[:] = a * x_host + + # Device side calculation + y_device.lincomb(a, x_device) + + # Cuda only uses floats, so require 5 places + assert all_almost_equal(y_device, y_host) + + +def test_member_lincomb(fn): + scalar_values = [0, 1, -1, 3.41, 10.0, 1.0001] + for a in scalar_values: + _test_member_lincomb(fn, a) + + +def test_multiply(fn): + # Validates multiply against the result on host with randomized data + [xarr, yarr, zarr], [x_device, y_device, z_device] = example_vectors(fn, 3) + + # Host side calculation + zarr[:] = xarr * yarr + + # Device side calculation + fn.multiply(x_device, y_device, out=z_device) + + assert all_almost_equal([x_device, y_device, z_device], + [xarr, yarr, zarr]) + + # Aliased + zarr[:] = xarr * zarr + fn.multiply(z_device, x_device, out=z_device) + + assert all_almost_equal([x_device, z_device], + [xarr, zarr]) + + # Aliased + zarr[:] = zarr * zarr + fn.multiply(z_device, z_device, out=z_device) + + assert all_almost_equal(z_device, zarr) + + +def test_member_multiply(fn): + # Validate vector member multiply against the result on host + # with randomized data + [x_host, y_host], [x_device, y_device] = example_vectors(fn, 2) + + # Host side calculation + y_host *= x_host + + # Device side calculation + y_device *= x_device + + # Cuda only uses floats, so require 5 places + assert all_almost_equal(y_device, y_host) + + +def _test_unary_operator(spc, function): + # Verify that the statement y=function(x) gives equivalent + # results to Numpy. + x_arr, x = example_vectors(spc) + + y_arr = function(x_arr) + y = function(x) + + assert all_almost_equal([x, y], + [x_arr, y_arr]) + + +def _test_binary_operator(spc, function): + # Verify that the statement z=function(x,y) gives equivalent + # results to Numpy. 
+ [x_arr, y_arr], [x, y] = example_vectors(spc, 2) + + z_arr = function(x_arr, y_arr) + z = function(x, y) + + assert all_almost_equal([x, y, z], + [x_arr, y_arr, z_arr]) + + +def test_operators(fn): + # Test of all operator overloads against the corresponding + # Numpy implementation + + # Unary operators + _test_unary_operator(fn, lambda x: +x) + _test_unary_operator(fn, lambda x: -x) + + # Scalar multiplication + for scalar in [-31.2, -1, 0, 1, 2.13]: + def imul(x): + x *= scalar + _test_unary_operator(fn, imul) + _test_unary_operator(fn, lambda x: x * scalar) + + # Scalar division + for scalar in [-31.2, -1, 1, 2.13]: + def idiv(x): + x /= scalar + _test_unary_operator(fn, idiv) + _test_unary_operator(fn, lambda x: x / scalar) + + # Incremental operations + def iadd(x, y): + x += y + + def isub(x, y): + x -= y + + def imul(x, y): + x *= y + + def idiv(x, y): + x /= y + + _test_binary_operator(fn, iadd) + _test_binary_operator(fn, isub) + _test_binary_operator(fn, imul) + _test_binary_operator(fn, idiv) + + # Incremental operators with aliased inputs + def iadd_aliased(x): + x += x + + def isub_aliased(x): + x -= x + + def imul_aliased(x): + x *= x + + def idiv_aliased(x): + x /= x + + _test_unary_operator(fn, iadd_aliased) + _test_unary_operator(fn, isub_aliased) + _test_unary_operator(fn, imul_aliased) + _test_unary_operator(fn, idiv_aliased) + + # Binary operators + _test_binary_operator(fn, lambda x, y: x + y) + _test_binary_operator(fn, lambda x, y: x - y) + _test_binary_operator(fn, lambda x, y: x * y) + _test_binary_operator(fn, lambda x, y: x / y) + + # Binary with aliased inputs + _test_unary_operator(fn, lambda x: x + x) + _test_unary_operator(fn, lambda x: x - x) + _test_unary_operator(fn, lambda x: x * x) + _test_unary_operator(fn, lambda x: x / x) + + +def test_incompatible_operations(): + r3 = CudaFn(3) + R3h = odl.rn(3) + xA = r3.zero() + xB = R3h.zero() + + with pytest.raises(TypeError): + xA += xB + + with pytest.raises(TypeError): + xA -= xB + + with pytest.raises(TypeError): + xA + xB + + with pytest.raises(TypeError): + xA - xB + + +def test_copy(fn): + import copy + + x = example_element(fn) + y = copy.copy(x) + + assert x == y + assert y is not x + + z = copy.deepcopy(x) + + assert x == z + assert z is not x + + +def test_transpose(fn): + x = example_element(fn) + y = example_element(fn) + + # Assert linear operator + assert isinstance(x.T, odl.Operator) + assert x.T.is_linear + + # Check result + assert almost_equal(x.T(y), x.inner(y)) + assert all_almost_equal(x.T.adjoint(1.0), x) + + # x.T.T returns self + assert x.T.T == x + + +def test_modify(): + r3 = CudaFn(3) + xd = r3.element([1, 2, 3]) + yd = r3.element(data_ptr=xd.data_ptr) + + yd[:] = [5, 6, 7] + + assert all_equal(xd, yd) + + +def test_sub_vector(): + r6 = CudaFn(6) + r3 = CudaFn(3) + xd = r6.element([1, 2, 3, 4, 5, 6]) + + yd = r3.element(data_ptr=xd.data_ptr) + yd[:] = [7, 8, 9] + + assert all_almost_equal([7, 8, 9, 4, 5, 6], xd) + + +def test_offset_sub_vector(): + r6 = CudaFn(6) + r3 = CudaFn(3) + xd = r6.element([1, 2, 3, 4, 5, 6]) + + yd = r3.element(data_ptr=xd.data_ptr + 3 * xd.space.dtype.itemsize) + yd[:] = [7, 8, 9] + + assert all_equal([1, 2, 3, 7, 8, 9], xd) + + +def _test_dtype(dt): + if dt not in CUDA_DTYPES: + with pytest.raises(TypeError): + r3 = CudaFn(3, dt) + else: + r3 = CudaFn(3, dt) + x = r3.element([1, 2, 3]) + y = r3.element([4, 5, 6]) + z = x + y + assert all_almost_equal(z, [5, 7, 9]) + + +def test_dtypes(): + for dt in [np.int8, np.int16, np.int32, np.int64, np.int, + 
np.uint8, np.uint16, np.uint32, np.uint64, np.uint, + np.float32, np.float64, np.float, + np.complex64, np.complex128, np.complex]: + yield _test_dtype, dt + +# --- Weighting tests --- # + + +def test_const_init(exponent): + const = 1.5 + CudaFnConstWeighting(const, exponent=exponent) + + +def test_const_equals(exponent): + constant = 1.5 + + weighting = CudaFnConstWeighting(constant, exponent=exponent) + weighting2 = CudaFnConstWeighting(constant, exponent=exponent) + other_weighting = CudaFnConstWeighting(2.5, exponent=exponent) + wrong_exp = CudaFnConstWeighting(constant, exponent=exponent + 1) + + assert weighting == weighting + assert weighting == weighting2 + assert weighting2 == weighting + + assert weighting != other_weighting + assert weighting != wrong_exp + + +def test_const_inner(): + rn = CudaFn(5) + [xarr, yarr], [x, y] = example_vectors(rn, 2) + + constant = 1.5 + weighting = CudaFnConstWeighting(constant) + + true_inner = constant * np.vdot(yarr, xarr) + assert almost_equal(weighting.inner(x, y), true_inner) + + +def test_const_norm(exponent): + rn = CudaFn(5) + xarr, x = example_vectors(rn) + + constant = 1.5 + weighting = CudaFnConstWeighting(constant, exponent=exponent) + + factor = 1 if exponent == float('inf') else constant ** (1 / exponent) + true_norm = factor * np.linalg.norm(xarr, ord=exponent) + + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + weighting.norm(x) + else: + assert almost_equal(weighting.norm(x), true_norm) + + # Same with free function + pnorm = cu_weighted_norm(constant, exponent=exponent) + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + pnorm(x) + else: + assert almost_equal(pnorm(x), true_norm) + + +def test_const_dist(exponent): + rn = CudaFn(5) + [xarr, yarr], [x, y] = example_vectors(rn, n=2) + + constant = 1.5 + weighting = CudaFnConstWeighting(constant, exponent=exponent) + + factor = 1 if exponent == float('inf') else constant ** (1 / exponent) + true_dist = factor * np.linalg.norm(xarr - yarr, ord=exponent) + + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + weighting.dist(x, y) + else: + assert almost_equal(weighting.dist(x, y), true_dist) + + # Same with free function + pdist = cu_weighted_dist(constant, exponent=exponent) + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + pdist(x, y) + else: + assert almost_equal(pdist(x, y), true_dist) + + +def test_vector_init(): + rn = CudaFn(5) + weight_vec = _pos_vector(rn) + + CudaFnVectorWeighting(weight_vec) + CudaFnVectorWeighting(rn.element(weight_vec)) + + +def test_vector_is_valid(): + rn = CudaFn(5) + weight = _pos_vector(rn) + + weighting = CudaFnVectorWeighting(weight) + + assert weighting.is_valid() + + # Invalid + weight[0] = 0 + + weighting = CudaFnVectorWeighting(weight) + + assert not weighting.is_valid() + + +def test_vector_equals(): + rn = CudaFn(5) + weight = _pos_vector(rn) + + weighting = CudaFnVectorWeighting(weight) + weighting2 = CudaFnVectorWeighting(weight) + + assert weighting == weighting2 + + +def test_vector_inner(): + rn = CudaFn(5) + [xarr, yarr], [x, y] = example_vectors(rn, 2) + + weight = _pos_vector(CudaFn(5)) + + weighting = CudaFnVectorWeighting(weight) + + true_inner = np.vdot(yarr, xarr * weight.asarray()) + + assert almost_equal(weighting.inner(x, y), true_inner) + + # Same with free function + inner_vec = 
cu_weighted_inner(weight) + + assert almost_equal(inner_vec(x, y), true_inner) + + # Exponent != 2 -> no inner product, should raise + with pytest.raises(NotImplementedError): + CudaFnVectorWeighting(weight, exponent=1.0).inner(x, y) + + +def test_vector_norm(exponent): + rn = CudaFn(5) + xarr, x = example_vectors(rn) + + weight = _pos_vector(CudaFn(5)) + + weighting = CudaFnVectorWeighting(weight, exponent=exponent) + + if exponent in (1.0, float('inf')): + true_norm = np.linalg.norm(weight.asarray() * xarr, ord=exponent) + else: + true_norm = np.linalg.norm(weight.asarray() ** (1 / exponent) * xarr, + ord=exponent) + + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + weighting.norm(x) + else: + assert almost_equal(weighting.norm(x), true_norm) + + # Same with free function + pnorm = cu_weighted_norm(weight, exponent=exponent) + + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + pnorm(x) + else: + assert almost_equal(pnorm(x), true_norm) + + +def test_vector_dist(exponent): + rn = CudaFn(5) + [xarr, yarr], [x, y] = example_vectors(rn, n=2) + + weight = _pos_vector(CudaFn(5)) + + weighting = CudaFnVectorWeighting(weight, exponent=exponent) + + if exponent in (1.0, float('inf')): + true_dist = np.linalg.norm(weight.asarray() * (xarr - yarr), + ord=exponent) + else: + true_dist = np.linalg.norm( + weight.asarray() ** (1 / exponent) * (xarr - yarr), ord=exponent) + + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + weighting.dist(x, y) + else: + assert almost_equal(weighting.dist(x, y), true_dist) + + # Same with free function + pdist = cu_weighted_dist(weight, exponent=exponent) + + if exponent == float('inf'): + # Not yet implemented, should raise + with pytest.raises(NotImplementedError): + pdist(x, y) + else: + assert almost_equal(pdist(x, y), true_dist) + + +def test_custom_inner(fn): + [xarr, yarr], [x, y] = example_vectors(fn, 2) + + def inner(x, y): + return np.vdot(y, x) + + w = CudaFnCustomInnerProduct(inner) + w_same = CudaFnCustomInnerProduct(inner) + w_other = CudaFnCustomInnerProduct(np.dot) + w_d = CudaFnCustomInnerProduct(inner, dist_using_inner=False) + + assert w == w + assert w == w_same + assert w != w_other + assert w != w_d + + true_inner = inner(xarr, yarr) + assert almost_equal(w.inner(x, y), true_inner) + + true_norm = np.linalg.norm(xarr) + assert almost_equal(w.norm(x), true_norm) + + true_dist = np.linalg.norm(xarr - yarr) + # Using 3 places (single precision default) since the result is always + # double even if the underlying computation was only single precision + assert almost_equal(w.dist(x, y), true_dist, places=3) + assert almost_equal(w_d.dist(x, y), true_dist) + + with pytest.raises(TypeError): + CudaFnCustomInnerProduct(1) + + +def test_custom_norm(fn): + [xarr, yarr], [x, y] = example_vectors(fn, 2) + + norm = np.linalg.norm + + def other_norm(x): + return np.linalg.norm(x, ord=1) + + w = CudaFnCustomNorm(norm) + w_same = CudaFnCustomNorm(norm) + w_other = CudaFnCustomNorm(other_norm) + + assert w == w + assert w == w_same + assert w != w_other + + with pytest.raises(NotImplementedError): + w.inner(x, y) + + true_norm = np.linalg.norm(xarr) + assert almost_equal(w.norm(x), true_norm) + + true_dist = np.linalg.norm(xarr - yarr) + assert almost_equal(w.dist(x, y), true_dist) + + with pytest.raises(TypeError): + CudaFnCustomNorm(1) + + +def test_custom_dist(fn): + [xarr, 
yarr], [x, y] = example_vectors(fn, 2) + + def dist(x, y): + return np.linalg.norm(x - y) + + def other_dist(x, y): + return np.linalg.norm(x - y, ord=1) + + w = CudaFnCustomDist(dist) + w_same = CudaFnCustomDist(dist) + w_other = CudaFnCustomDist(other_dist) + + assert w == w + assert w == w_same + assert w != w_other + + with pytest.raises(NotImplementedError): + w.inner(x, y) + + with pytest.raises(NotImplementedError): + w.norm(x) + + true_dist = np.linalg.norm(xarr - yarr) + assert almost_equal(w.dist(x, y), true_dist) + + with pytest.raises(TypeError): + CudaFnCustomDist(1) + + +def test_ufuncs(fn, ufunc): + name, n_args, n_out, _ = ufunc + if (np.issubsctype(fn.dtype, np.floating) and + name in ['bitwise_and', + 'bitwise_or', + 'bitwise_xor', + 'invert', + 'left_shift', + 'right_shift']): + # Skip integer only methods if floating point type + return + + # Get the ufunc from numpy as reference + ufunc = getattr(np, name) + + # Create some data + arrays, vectors = example_vectors(fn, n_args + n_out) + in_arrays = arrays[:n_args] + out_arrays = arrays[n_args:] + data_vector = vectors[0] + in_vectors = vectors[1:n_args] + out_vectors = vectors[n_args:] + + # Out of place: + np_result = ufunc(*in_arrays) + vec_fun = getattr(data_vector.ufunc, name) + odl_result = vec_fun(*in_vectors) + assert all_almost_equal(np_result, odl_result) + + # Test type of output + if n_out == 1: + assert isinstance(odl_result, fn.element_type) + elif n_out > 1: + for i in range(n_out): + assert isinstance(odl_result[i], fn.element_type) + + # In place: + np_result = ufunc(*(in_arrays + out_arrays)) + vec_fun = getattr(data_vector.ufunc, name) + odl_result = vec_fun(*(in_vectors + out_vectors)) + assert all_almost_equal(np_result, odl_result) + + # Test inplace actually holds: + if n_out == 1: + assert odl_result is out_vectors[0] + elif n_out > 1: + for i in range(n_out): + assert odl_result[i] is out_vectors[i] + + +def test_reductions(fn, reduction): + name, _ = reduction + + ufunc = getattr(np, name) + + # Create some data + x_arr, x = example_vectors(fn, 1) + + assert almost_equal(ufunc(x_arr), getattr(x.ufunc, name)()) + + +if __name__ == '__main__': + pytest.main(str(__file__.replace('\\', '/') + ' -v')) diff --git a/test/cuda_test.py b/test/cuda_test.py index 1a07573..8796131 100644 --- a/test/cuda_test.py +++ b/test/cuda_test.py @@ -3,7 +3,7 @@ This is mostly compilation tests, main suite is in odl """ -import odlpp.odlpp_cuda as cuda +import odlcuda.odlcuda_ as cuda import pytest vec_ids = [str(el) for el in dir(cuda) if el.startswith('CudaVector')]
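
For orientation, a minimal usage sketch (not part of the patch) of the backend this diff adds, mirroring what `test/cu_ntuples_test.py` exercises: creating a CUDA-backed vector with `impl='cuda'` and calling the ufuncs and reductions defined in `odlcuda/ufuncs.py`. It assumes odlcuda and ODL are installed in the active Python environment and that a CUDA-capable device is available.

    # Minimal usage sketch; assumes odlcuda is installed and a CUDA device exists.
    import odl

    # CUDA-backed vector, as in test_vector_cuda
    x = odl.vector([1.0, 2.0, 3.0], dtype='float32', impl='cuda')

    # Element-wise ufuncs and reductions go through the `ufunc` attribute,
    # as in test_ufuncs and test_reductions
    y = x.ufunc.exp()   # returns a new element of the same space
    s = x.ufunc.sum()   # scalar reduction
    print(y, s)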