diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c8a0bfb..4406c1b1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,13 +52,20 @@ if(Matar_ENABLE_KOKKOS)
     if("${Matar_KOKKOS_PACKAGE}" STREQUAL "Trilinos")
         find_package(Trilinos REQUIRED)
         add_definitions(-DTRILINOS_INTERFACE=1)
+    elseif(Matar_ENABLE_TRILINOS)
+        find_package(Trilinos REQUIRED)
+        add_definitions(-DTRILINOS_INTERFACE=1)
     else()
         find_package(Kokkos REQUIRED)
     endif()

     if (Matar_ENABLE_MPI)
         find_package(MPI REQUIRED)
         add_definitions(-DHAVE_MPI=1)
-        target_link_libraries(matar INTERFACE Kokkos::kokkos MPI::MPI_CXX)
+        if(Matar_ENABLE_TRILINOS)
+            target_link_libraries(matar INTERFACE Trilinos::all_selected_libs MPI::MPI_CXX)
+        else()
+            target_link_libraries(matar INTERFACE Kokkos::kokkos MPI::MPI_CXX)
+        endif()
     else()
         target_link_libraries(matar INTERFACE Kokkos::kokkos)
     endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0f3e6aab..d45de953 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -10,9 +10,9 @@ if (NOT TARGET distclean)
   INCLUDE(../cmake/Modules/TargetDistclean.cmake OPTIONAL)
 endif (NOT TARGET distclean)

+find_package(Matar REQUIRED)
 set(LINKING_LIBRARIES matar)

-find_package(Matar REQUIRED)
 if (MPI)
   find_package(MPI REQUIRED)
   add_definitions(-DHAVE_MPI=1)
@@ -36,9 +36,43 @@ if (NOT KOKKOS)
 endif()

 if (KOKKOS)
-  find_package(Kokkos REQUIRED) #new
+  if (Matar_ENABLE_TRILINOS)
+    find_package(Trilinos REQUIRED) #new
+    # Assume if the CXX compiler exists, the rest do too.
+    if (EXISTS ${Trilinos_CXX_COMPILER})
+      set(CMAKE_CXX_COMPILER ${Trilinos_CXX_COMPILER})
+      set(CMAKE_C_COMPILER ${Trilinos_C_COMPILER})
+      set(CMAKE_Fortran_COMPILER ${Trilinos_Fortran_COMPILER})
+    endif()
+    if(NOT DISTRIBUTION)
+      # Make sure to use the same compilers and flags as Trilinos
+      set(CMAKE_CXX_FLAGS "${Trilinos_CXX_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}")
+      set(CMAKE_C_FLAGS "${Trilinos_C_COMPILER_FLAGS} ${CMAKE_C_FLAGS}")
+      set(CMAKE_Fortran_FLAGS "${Trilinos_Fortran_COMPILER_FLAGS} ${CMAKE_Fortran_FLAGS}")
+    endif()
+
+    message("\nFound Trilinos!  Here are the details: ")
+    message("   Trilinos_DIR = ${Trilinos_DIR}")
+    message("   Trilinos_VERSION = ${Trilinos_VERSION}")
+    message("   Trilinos_PACKAGE_LIST = ${Trilinos_PACKAGE_LIST}")
+    message("   Trilinos_LIBRARIES = ${Trilinos_LIBRARIES}")
+    message("   Trilinos_INCLUDE_DIRS = ${Trilinos_INCLUDE_DIRS}")
+    message("   Trilinos_LIBRARY_DIRS = ${Trilinos_LIBRARY_DIRS}")
+    message("   Trilinos_TPL_LIST = ${Trilinos_TPL_LIST}")
+    message("   Trilinos_TPL_INCLUDE_DIRS = ${Trilinos_TPL_INCLUDE_DIRS}")
+    message("   Trilinos_TPL_LIBRARIES = ${Trilinos_TPL_LIBRARIES}")
+    message("   Trilinos_TPL_LIBRARY_DIRS = ${Trilinos_TPL_LIBRARY_DIRS}")
+    message("   Trilinos_BUILD_SHARED_LIBS = ${Trilinos_BUILD_SHARED_LIBS}")
+    message("End of Trilinos details\n")
+
+    include_directories(${Trilinos_INCLUDE_DIRS} ${Trilinos_TPL_INCLUDE_DIRS})
+    list(APPEND LINKING_LIBRARIES Trilinos::all_selected_libs)
+    add_definitions(-DTRILINOS_INTERFACE=1)
+  else()
+    find_package(Kokkos REQUIRED) #new
+    list(APPEND LINKING_LIBRARIES Kokkos::kokkos)
+  endif()
-  list(APPEND LINKING_LIBRARIES Kokkos::kokkos)

   add_definitions(-DHAVE_KOKKOS=1)
@@ -76,11 +110,30 @@ if (KOKKOS)
   add_executable(annkokkos ann_kokkos.cpp)
   target_link_libraries(annkokkos ${LINKING_LIBRARIES})

+  add_executable(annkokkos_compare ann_kokkos_compare.cpp)
+  target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES})
+
+  if (Matar_ENABLE_TRILINOS)
+    add_executable(anndistributed ann_distributed.cpp)
+    target_link_libraries(anndistributed ${LINKING_LIBRARIES})
+
+    add_executable(anndistributed_crs ann_distributed_crs.cpp)
+    target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES})
+
+    add_executable(test_tpetra_farray test_tpetra_farray.cpp)
+    target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES})
+  endif()
+
   if (OPENMP)
     add_executable(parallel_hello_world parallel_hello_world.cpp)
     target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES})
   endif()

+  if (MPI)
+    include_directories(laplaceMPI)
+    add_subdirectory(laplaceMPI)
+  endif()
+
 endif()

 ### HIP Linking error, will add back in after fixed
@@ -114,11 +167,6 @@ add_subdirectory(sparsetests)
 include_directories(test_rocm)
 add_subdirectory(test_rocm)

-if (MPI)
-  include_directories(laplaceMPI)
-  add_subdirectory(laplaceMPI)
-endif()
-
 #include_directories(phaseField/srcKokkosVerbose)
 #add_subdirectory(phaseField/srcKokkosVerbose)
diff --git a/examples/ann_distributed.cpp b/examples/ann_distributed.cpp
new file mode 100644
index 00000000..83a6a2a4
--- /dev/null
+++ b/examples/ann_distributed.cpp
@@ -0,0 +1,450 @@
+/**********************************************************************************************
+ © 2020. Triad National Security, LLC. All rights reserved.
+ This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos
+ National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S.
+ Department of Energy/National Nuclear Security Administration. All rights in the program are
+ reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear
+ Security Administration. The Government is granted for itself and others acting on its behalf a
+ nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare
+ derivative works, distribute copies to the public, perform publicly and display publicly, and
+ to permit others to do so.
+ This program is open source under the BSD-3 License.
+ Redistribution and use in source and binary forms, with or without modification, are permitted
+ provided that the following conditions are met:
+ 1. Redistributions of source code must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of
+ conditions and the following disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used
+ to endorse or promote products derived from this software without specific prior
+ written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ **********************************************************************************************/
+#include <stdio.h>
+#include <math.h>
+#include <iostream>
+#include <vector>
+#include <mpi.h>
+
+#include "matar.h"
+
+using namespace mtr; // matar namespace
+
+
+
+// =================================================================
+// Artificial Neural Network (ANN)
+//
+// For a single layer, we have x_i inputs with weights_{ij},
+// creating y_j outputs. We have
+//    y_j = Fcn(b_j) = Fcn( Sum_i {x_i w_{ij}} )
+// where the activation function Fcn is applied to b_j, creating
+// outputs y_j. For multiple layers, we have
+//    b_j^l = Sum_i (x_i^{l-1} w_{ij}^l)
+// where l is a layer, and as before, an activation function is
+// applied to b_j^l, creating outputs y_j^l.
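+//
+// As a small worked instance of the formula above (the numbers are
+// illustrative only, not taken from this example): with two inputs
+// x = (1, 1), weights w_{11} = w_{21} = 1, and bias b_1 = 0, the
+// single output is
+//    y_1 = Fcn(1*1 + 1*1) = sigmoid(2) ~= 0.88
+// This is also the pattern the checks below rely on: with all inputs
+// and weights set to 1, the pre-activation sum for output row j
+// equals the number of inputs, num_i.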
+//
+// =================================================================
+
+
+// =================================================================
+//
+// Number of nodes in each layer including inputs and outputs
+//
+// =================================================================
+std::vector<size_t> num_nodes_in_layer = {64000, 30000, 8000, 4000, 2000, 1000, 100};
+//std::vector<size_t> num_nodes_in_layer = {50, 25};
+// {9, 50, 100, 300, 200, 100, 20, 6}
+
+
+
+// =================================================================
+//
+// data types and classes
+//
+// =================================================================
+
+// array of ANN structs
+struct ANNLayer_t{
+    // the input map stores every global id in the vector, for simplicity
+    // of the row-vector products in this example
+    TpetraPartitionMap<> output_partition_map; // map with all comms for the row-vector product
+    TpetraPartitionMap<> output_unique_map;    // submap of uniquely decomposed indices
+    TpetraDFArray<float> distributed_outputs;
+    TpetraDFArray<float> distributed_weights;
+    TpetraDFArray<float> distributed_biases;
+
+}; // end struct
+
+
+
+// =================================================================
+//
+// functions
+//
+// =================================================================
+void vec_mat_multiply(TpetraDFArray<float> &inputs,
+                      TpetraDFArray<float> &outputs,
+                      TpetraDFArray<float> &matrix){
+
+    const size_t num_i = inputs.size();
+    const size_t num_j = outputs.submap_size();
+
+    using team_t = typename Kokkos::TeamPolicy<>::member_type;
+    Kokkos::parallel_for ("MatVec", Kokkos::TeamPolicy<> (num_j, Kokkos::AUTO),
+                          KOKKOS_LAMBDA (const team_t& team_h) {
+
+        // one team per output row j; the team reduces the dot product of row j
+        float sum = 0;
+        int j = team_h.league_rank();
+        Kokkos::parallel_reduce (Kokkos::TeamThreadRange (team_h, num_i),
+                                 [&] (int i, float& lsum) {
+            lsum += inputs(i)*matrix(j,i);
+        }, sum); // end parallel reduce
+
+        // map the submap index j back to a local index of the output vector
+        int global_index = outputs.getSubMapGlobalIndex(j);
+        int local_index = outputs.getMapLocalIndex(global_index);
+        outputs(local_index) = sum;
+
+    }); // end parallel for
+
+    // with all inputs and matrix entries set to 1, each row sum should equal num_i
+    FOR_ALL(j, 0, num_j, {
+        int global_index = outputs.getSubMapGlobalIndex(j);
+        int local_index = outputs.getMapLocalIndex(global_index);
+        if(fabs(outputs(local_index) - num_i) >= 1e-15){
+            printf("error in vec mat multiply test at row %d of %f\n", j, fabs(outputs(local_index) - num_i));
+        }
+    });
+
+    return;
+
+}; // end function
+
+
+KOKKOS_INLINE_FUNCTION
+float sigmoid(const float value){
+    return 1.0/(1.0 + exp(-value)); // exp2f doesn't work with CUDA
+}; // end function
+
+
+KOKKOS_INLINE_FUNCTION
+float sigmoid_derivative(const float value){
+    float sigval = sigmoid(value);
+    return sigval*(1.0 - sigval);
+}; // end function
+
+
+
+
+void forward_propagate_layer(TpetraDFArray<float> &inputs,
+                             TpetraDFArray<float> &outputs,
+                             TpetraDFArray<float> &weights,
+                             const TpetraDFArray<float> &biases){
+
+    const size_t num_i = inputs.size();
+    const size_t num_j = outputs.submap_size();
+
+    //perform comms to get the full input vector for row-vector products on the matrix
+    //VERY SIMPLE EXAMPLE OF COMMS; THIS IS A TERRIBLE WAY TO DECOMPOSE THE PROBLEM
+
+    FOR_ALL(j, 0, num_j, {
+
+        //printf("thread = %d \n", omp_get_thread_num());
+
+        // dot product of the input vector with row j of the weight matrix
+        float value = 0.0;
+        for(int i=0; i<num_i; i++){
+            value += inputs(i)*weights(j,i);
+        } // end for
+
+        // map the submap index j back to a local index of the output vector
+        int global_index = outputs.getSubMapGlobalIndex(j);
+        int local_index = outputs.getMapLocalIndex(global_index);
+
+        // apply the activation function to the biased sum
+        outputs(local_index) = sigmoid(value + biases(j));
+
+    }); // end parallel for
+
+    /* team-policy version of the loop above, kept for reference
+    using team_t = typename Kokkos::TeamPolicy<>::member_type;
+    Kokkos::parallel_for ("MatVec", Kokkos::TeamPolicy<> (num_j, Kokkos::AUTO),
+                          KOKKOS_LAMBDA (const team_t& team_h) {
+
+        float sum = 0;
+        int j = team_h.league_rank();
+        Kokkos::parallel_reduce (Kokkos::TeamThreadRange (team_h, num_i),
+                                 [&] (int i, float& lsum) {
+            lsum += inputs(i)*weights(j,i) + biases(j);
+        }, sum); // end parallel reduce
+
+        int global_index = outputs.getSubMapGlobalIndex(j);
+        int local_index = outputs.getMapLocalIndex(global_index);
+        outputs(local_index) = 1.0/(1.0 + exp(-sum));
+
+    }); // end parallel for
+    */
+
+
+    return;
+
+}; // end function
+
+
+void set_biases(const TpetraDFArray<float> &biases){
+    const size_t num_j = biases.size();
+
+    FOR_ALL(j, 0, num_j, {
+        biases(j) = 0.0;
+    }); // end parallel for
+
+}; // end function
+
+
+void set_weights(const TpetraDFArray<float> &weights){
+
+    const size_t num_i = weights.dims(0);
+    const size_t num_j = weights.dims(1);
+
+    FOR_ALL(i, 0, num_i,
+            j, 0, num_j, {
+
+        weights(i,j) = 1.0;
+    }); // end parallel for
+
+}; // end function
+
+
+// =================================================================
+//
+// Main function
+//
+// =================================================================
+int main(int argc, char* argv[])
+{
+    MPI_Init(&argc, &argv);
+    int process_rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
+    Kokkos::initialize(argc, argv);
+    {
+
+        // =================================================================
+        // allocate arrays
+        // =================================================================
+
+        // note: num_nodes_in_layer includes the inputs to the ANN, so subtract 1 for the number of layers
+        size_t num_layers = num_nodes_in_layer.size()-1;
+
+        CArray<ANNLayer_t> ANNLayers(num_layers); // starts at 1 and goes to num_layers
+
+        // input and output values of the ANN
+        TpetraPartitionMap<> input_pmap, input_unique_pmap;
+        DCArrayKokkos<long long int> all_layer_indices(num_nodes_in_layer[0]);
+        FOR_ALL(i, 0, num_nodes_in_layer[0], {
+            all_layer_indices(i) = i;
+        });
+        all_layer_indices.update_host(); // copy the indices back to the host
+
+        // map of all indices in this layer, to be used for the row-vector product
+        // (in practice, this would not include all indices in the layer)
+        input_pmap = TpetraPartitionMap<>(all_layer_indices);
+
+        // map that decomposes the indices of this layer onto the set of processes uniquely
+        // (used to demonstrate comms for the map above)
+        input_unique_pmap = TpetraPartitionMap<>(num_nodes_in_layer[0]);
+        TpetraDFArray<float> inputs(input_pmap); // rows decomposed onto processes
+        // comms coming from a subview require both the original map and the submap to be composed of contiguous indices
+        inputs.own_comm_setup(input_unique_pmap); // tells the vector it is communicating from a contiguous subset of its own data
+
+        // set the strides
+        // layer 0 are the inputs to the ANN
+        // layer n-1 are the outputs from the ANN
+        for (size_t layer=0; layer<num_layers; layer++){
+
+            // dimensions of the weight matrix in this layer
+            size_t num_i = num_nodes_in_layer[layer];
+            size_t num_j = num_nodes_in_layer[layer+1];
+
+            // allocate the output maps and vectors of this layer
+            DCArrayKokkos<long long int> all_current_layer_indices(num_nodes_in_layer[layer+1]);
+            FOR_ALL(i, 0, num_nodes_in_layer[layer+1], {
+                all_current_layer_indices(i) = i;
+            });
+
+            ANNLayers(layer).output_partition_map = TpetraPartitionMap<>(all_current_layer_indices);
+            ANNLayers(layer).output_unique_map = TpetraPartitionMap<>(num_nodes_in_layer[layer+1]);
+            ANNLayers(layer).distributed_outputs = TpetraDFArray<float>(ANNLayers(layer).output_partition_map);
+            // comms coming from a subview require both the original map and the submap to be composed of contiguous indices
+            ANNLayers(layer).distributed_outputs.own_comm_setup(ANNLayers(layer).output_unique_map);
+
+            // allocate the weights in this layer
+            ANNLayers(layer).distributed_weights = TpetraDFArray<float>(num_j, num_i);
+            ANNLayers(layer).distributed_biases = TpetraDFArray<float>(num_j);
+
+        } // end for
+
+
+        // =================================================================
+        // set weights, biases, and inputs
+        // =================================================================
+
+        // inputs to the ANN
+        size_t local_input_size = inputs.submap_size();
+        //std::cout << "full_input_size " << input_pmap.num_global_ << "\n";
+        for (size_t i=0; i<local_input_size; i++){
+            inputs.host(i) = 1.0;
+        }
+
+        // debug output of the distributed vector (requires Teuchos)
+        // std::ostream &out = std::cout;
+        // Teuchos::RCP<Teuchos::FancyOStream> fos;
+        // fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(out));
+        // inputs.tpetra_sub_vector->describe(*fos,Teuchos::VERB_EXTREME);
+
+        inputs.update_device();  // copy inputs to the device
+        inputs.perform_comms();  // distribute to the full map for the row-vector product
+
+        // for (size_t i=0; i