Muesli
 All Classes Namespaces Files Functions Typedefs Enumerations
dmatrix.h
1 /*
2  * DMatrix.h
3  *
4  * Author: Steffen Ernsting <s.ernsting@uni-muenster.de>
5  *
6  * -------------------------------------------------------------------------------
7  *
8  * The MIT License
9  *
10  * Copyright 2014 Steffen Ernsting <s.ernsting@uni-muenster.de>,
11  * Herbert Kuchen <kuchen@uni-muenster.de.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this software and associated documentation files (the "Software"), to deal
15  * in the Software without restriction, including without limitation the rights
16  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17  * copies of the Software, and to permit persons to whom the Software is
18  * furnished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29  * THE SOFTWARE.
30  *
31  */
32 
33 #pragma once
34 
35 #include "omp.h"
36 #include "mpi.h"
37 #include "muesli.h"
38 #include "exception.h"
39 #include "functors.h"
40 #include "lmatrix.h"
41 #include "plmatrix.h"
42 #include "exec_plan.h"
43 #ifdef __CUDACC__
44 #include "map_kernels.cuh"
45 #include "zip_kernels.cuh"
46 #include "fold_kernels.cuh"
47 #include "copy_kernel.cuh"
48 #include "properties.cuh"
49 #endif
50 
51 namespace msl {
52 
63 template <typename T>
64 class DMatrix
65 {
66 public:
67 
68  //
69  // CONSTRUCTORS / DESTRUCTOR
70  //
71 
75  DMatrix();
76 
86  DMatrix(int n0, int m0, int rows, int cols, Distribution d = DIST);
87 
99  DMatrix(int n0, int m0, int rows, int cols, const T& initial_value, Distribution d = DIST);
100 
113  DMatrix(int n0, int m0, int rows, int cols, T* const initial_matrix, Distribution d = DIST);
114 
115 // DMatrix(int n0, int m0, int rows, int cols, const T* const * const initial_matrix, Distribution d = DIST);
116 
129  DMatrix(int n0, int m0, int rows, int cols, T (*f)(int, int), Distribution d = DIST);
130 
143  template <typename F2>
144  DMatrix(int n0, int m0, int rows, int cols, const F2& f, Distribution d = DIST);
145 
153  DMatrix(int n0, int m0);
154 
163  DMatrix(int n0, int m0, const T& initial_value);
164 
174  DMatrix(int n0, int m0, T* const initial_matrix);
175 
176 // DMatrix(int n0, int m0, const T* const * const initial_matrix);
177 
187  DMatrix(int n0, int m0, T (*f)(int, int));
188 
198  template <typename F2>
199  DMatrix(int n0, int m0, const F2& f);
200 
204  DMatrix(const DMatrix<T>& cs);
205 
209  ~DMatrix();
210 
211  // ASSIGNMENT OPERATOR
215  DMatrix<T>& operator=(const DMatrix<T>& rhs);
216 
217 
218  //
219  // FILL
220  //
221 
228  void fill(const T& value);
229 
237  void fill(T* const values);
238 
239 // void fill(T** values);
240 
248  void fill(T (*f)(int, int));
249 
257  template <typename F2>
258  void fill(const F2& f);
259 
260 
261  //
262  // SKELETONS / COMPUTATION
263  //
264 
265  // SKELETONS / COMPUTATION / MAP
266 
272  template <typename MapFunctor>
273  void mapInPlace(MapFunctor& f);
274 
282  template <typename MapIndexFunctor>
283  void mapIndexInPlace(MapIndexFunctor& f);
284 
291  template <typename R, typename MapFunctor>
292  msl::DMatrix<R> map(MapFunctor& f);
293 
301  template <typename R, typename MapIndexFunctor>
302  DMatrix<R> mapIndex(MapIndexFunctor& f);
303 
311  template <typename MapStencilFunctor>
312  void mapStencilInPlace(MapStencilFunctor& f, T neutral_value);
313 
321  template <typename R, typename MapStencilFunctor>
322  DMatrix<R> mapStencil(MapStencilFunctor& f, T neutral_value);
323 
324 #ifndef __CUDACC__
325 
326  // SKELETONS / COMPUTATION / MAP / INPLACE
333  template <typename F>
334  void mapInPlace(const msl::Fct1<T, T, F>& f);
335 
342  void mapInPlace(T(*f)(T));
343 
344  // SKELETONS / COMPUTATION / MAP / INDEXINPLACE
352  template <typename F>
353  void mapIndexInPlace(const msl::Fct3<int, int, T, T, F>& f);
354 
362  void mapIndexInPlace(T(*f)(int, int, T));
363 
364  // SKELETONS / COMPUTATION / MAP
372  template <typename R, typename F>
373  msl::DMatrix<R> map(const msl::Fct1<T, R, F>& f);
374 
381  template <typename R>
382  msl::DMatrix<R> map(R(*f)(T));
383 
384  // SKELETONS / COMPUTATION / MAP / INDEX
392  template <typename R, typename F>
393  DMatrix<R> mapIndex(const msl::Fct3<int, int, T, R, F>& f);
394 
402  template <typename R>
403  DMatrix<R> mapIndex(R(*f)(int, int, T));
404 
405 #endif
406 
407  // SKELETONS / COMPUTATION / ZIP
408 
416  template <typename T2, typename ZipFunctor>
417  void zipInPlace(DMatrix<T2>& b, ZipFunctor& f);
418 
426  template <typename T2, typename ZipIndexFunctor>
427  void zipIndexInPlace(DMatrix<T2>& b, ZipIndexFunctor& f);
428 
435  template <typename R, typename T2, typename ZipFunctor>
436  DMatrix<R> zip(DMatrix<T2>& b, ZipFunctor& f);
437 
444  template <typename R, typename T2, typename ZipIndexFunctor>
445  DMatrix<R> zipIndex(DMatrix<T2>& b, ZipIndexFunctor& f);
446 
447 #ifndef __CUDACC__
448 
449  // SKELETONS / COMPUTATION / ZIP / INPLACE
457  template <typename T2, typename F>
458  void zipInPlace(DMatrix<T2>& b, const Fct2<T, T2, T, F>& f);
459 
467  template <typename T2>
468  void zipInPlace(DMatrix<T2>& b, T(*f)(T, T2));
469 
470  // SKELETONS / COMPUTATION / ZIP / INDEXINPLACE
479  template <typename T2, typename F>
480  void zipIndexInPlace(DMatrix<T2>& b, const Fct4<int, int, T, T2, T, F>& f);
481 
490  template <typename T2>
491  void zipIndexInPlace(DMatrix<T2>& b, T(*f)(int, int, T, T2));
492 
493  // SKELETONS / COMPUTATION / ZIP
501  template <typename R, typename T2, typename F>
502  DMatrix<R> zip(DMatrix<T2>& b, const Fct2<T, T2, R, F>& f);
503 
511  template <typename R, typename T2>
512  DMatrix<R> zip(DMatrix<T2>& b, R(*f)(T, T2));
513 
514  // SKELETONS / COMPUTATION / ZIP / INDEX
522  template <typename R, typename T2, typename F>
523  DMatrix<R> zipIndex(DMatrix<T2>& b, const Fct4<int, int, T, T2, R, F>& f);
524 
532  template <typename R, typename T2>
533  DMatrix<R> zipIndex(DMatrix<T2>& b, R(*f)(int, int, T, T2));
534 #endif
535 
536  // SKELETONS / COMPUTATION / FOLD
537 
538 #ifndef __CUDACC__
539 
552  template <typename FoldFunctor>
553  T fold(FoldFunctor& f, bool final_fold_on_cpu = 1);
554 
564  template <typename F>
565  T fold(const Fct2<T, T, T, F>& f);
566 
576  T fold(T(*f)(T, T));
577 
578 #else
579 
590  template <typename FoldFunctor>
591  T fold(FoldFunctor& f, bool final_fold_on_cpu = 0);
592 
593 #endif
594 
595 
596  //
597  // SKELETONS / COMMUNICATION
598  //
599 
600  // SKELETONS / COMMUNICATION / BROADCAST PARTITION
601 
611  void broadcastPartition(int blockRow, int blockCol);
612 
613  // SKELETONS / COMMUNICATION / GATHER
614 
623  void gather(T** b);
624 
633  void gather(DMatrix<T>& dm);
634 
635  // SKELETONS / COMMUNICATION / PERMUTE PARTITION
636 
649  template <class F1, class F2>
650  void permutePartition(const Fct2<int, int, int, F1>& newRow, const Fct2<int, int, int, F2>& newCol);
651 
662  void permutePartition(int (*f)(int, int), int (*g)(int, int));
663 
675  template <class F>
676  void permutePartition(int (*f)(int, int), const Fct2<int, int, int, F>& g);
677 
689  template <class F>
690  void permutePartition(const Fct2<int, int, int, F>& f, int (*g)(int, int));
691 
692  // SKELETONS / COMMUNICATION / ROTATE
693 
694  // SKELETONS / COMMUNICATION / ROTATE / ROTATE COLUMNS
695 
709  template <class F>
710  void rotateCols(const Fct1<int, int, F>& f);
711 
724  void rotateCols(int (*f)(int));
725 
737  void rotateCols(int rows);
738 
739  // SKELETONS / COMMUNICATION / ROTATE / ROTATE ROWS
740 
754  template <class F>
755  void rotateRows(const Fct1<int, int, F>& f);
756 
769  void rotateRows(int (*f)(int));
770 
782  void rotateRows(int cols);
783 
789 
790 
791  //
792  // GETTERS AND SETTERS
793  //
794 
800  T* getLocalPartition() const;
808  T get(size_t row, size_t col) const;
816  void set(int row, int col, const T& v);
817 
823  int getFirstRow() const;
824 
830  int getFirstCol() const;
831 
837  int getLocalCols() const;
838 
844  int getLocalRows() const;
845 
851  int getLocalSize() const;
852 
858  int getRows() const;
859 
865  int getCols() const;
866 
872  int getBlocksInCol() const;
873 
879  int getBlocksInRow() const;
880 
889  bool isLocal(int row, int col) const;
890 
900  T getLocal(int row, int col) const;
901 
910  void setLocal(int row, int col, const T& v);
911 
918  std::vector<GPUExecutionPlan<T> > getExecPlans();
919 
923  void setCopyDistribution();
924 
932  void setDistribution(int rows, int cols);
933 
934  //
935  // AUXILIARY
936  //
937 
944  std::vector<T*> upload(bool allocOnly = 0);
945 
949  void download();
950 
954  void freeDevice();
955 
962  void setGpuDistribution(Distribution dist);
963 
970 
977  void show(const std::string& descr = std::string());
978 
982  void printLocal();
983 
984 private:
985  // local partition
986  T* localPartition;
987  // position of processor in data parallel group of processors; zero-base
988  int id;
989  // number of rows
990  int n;
991  // number of columns
992  int m;
993  // number of local rows
994  int nLocal;
995  // number of local columns
996  int mLocal;
997  // nLocal * mLocal;
998  int localsize;
999  // number of (local) partitions per row
1000  int blocksInRow;
1001  // number of (local) partitions per column
1002  int blocksInCol;
1003  // X position of processor in data parallel group of processors
1004  int localColPosition;
1005  // Y position of processor in data parallel group of processors
1006  int localRowPosition;
1007  // position of processor in data parallel group of processors
1008  int localPosition;
1009  // first column index in local partition; assuming division mode: block
1010  int firstRow;
1011  // first row index in local partition; assuming division mode: block
1012  int firstCol;
1013  // index of first row in next partition
1014  int nextRow;
1015  // index of first column in next partition
1016  int nextCol;
1017  // first (global) index of local partition
1018  int firstIndex;
1019  // total number of MPI processes
1020  int np;
1021  // checks whether data is up to date in main (cpu) memory
1022  bool cpuMemoryFlag;
1023  // execution plans for each gpu
1024  GPUExecutionPlan<T>* plans = 0;
1025  // checks whether data is copy distributed among all processes
1026  Distribution dist;
1027  // checks whether data is copy distributed among all gpus
1028  bool gpuCopyDistributed = 0;
1029 
1030 
1031  //
1032  // AUXILIARY
1033  //
1034 
1035  // initializes distributed matrix (used in constructors).
1036  void init(int rows, int cols);
1037 
1038  // initializes the GPU execution plans.
1039  void initGPU();
1040 
1041  // returns the GPU id that locally stores the element at index index.
1042  int getGpuId(int row, int col) const;
1043 
1044 };
1045 
1046 } // namespace msl
1047 
1048 #include "../src/dmatrix_common.cpp"
1049 
1050 #ifdef __CUDACC__
1051 #include "../src/dmatrix.cu"
1052 #else
1053 #include "../src/dmatrix.cpp"
1054 #endif
1055 
1056 
1057 
void setDistribution(int rows, int cols)
Switch the distribution scheme from copy distributed to distributed. Note that rows * cols = numProce...
void set(int row, int col, const T &v)
Sets the element at the given global indices (row, col) to the given value v.
DMatrix< T > & operator=(const DMatrix< T > &rhs)
Assignment operator.
int getFirstCol() const
Returns the index of the first column of the local partition.
void show(const std::string &descr=std::string())
Prints the distributed array to standard output. Optionally, the user may pass a description that wil...
T * getLocalPartition() const
Returns the local partition.
Definition: distribution.h:39
int getLocalRows() const
Returns the number of rows of the local partition.
int getBlocksInCol() const
Returns the number of blocks (local partitions) in a column.
void mapIndexInPlace(MapIndexFunctor &f)
Replaces each element m[i][j] of the distributed matrix with f(i, j, m[i][j]). Note that besides the ...
void download()
Manually download the local partition from GPU memory.
Distribution getGpuDistribution()
Returns the current GPU distribution scheme.
void setLocal(int row, int col, const T &v)
Sets the element at the given local indices (row, col) to the given value v.
Class DMatrix represents a distributed matrix.
Definition: dmatrix.h:64
void setGpuDistribution(Distribution dist)
Set how the local partition is distributed among the GPUs. Current distribution schemes are: distribu...
DMatrix< R > zip(DMatrix< T2 > &b, ZipFunctor &f)
Non-inplace variant of the zip skeleton.
std::vector< GPUExecutionPlan< T > > getExecPlans()
Returns the GPU execution plans that store information about size, etc. for the GPU partitions...
void mapStencilInPlace(MapStencilFunctor &f, T neutral_value)
Replaces each element m[i][j] of the distributed matrix with f(i, j, m). Note that the index i and th...
T fold(FoldFunctor &f, bool final_fold_on_cpu=1)
Reduces all elements of the distributed matrix to a single element by successively applying the given...
Definition: exec_plan.h:36
DMatrix()
Default constructor.
int getLocalSize() const
Returns the size of the local partition.
void rotateRows(const Fct1< int, int, F > &f)
Rotates the partitions of the distributed matrix cyclically in horizontal direction.
DMatrix< R > zipIndex(DMatrix< T2 > &b, ZipIndexFunctor &f)
Non-inplace variant of the zipIndex skeleton.
DMatrix< R > mapIndex(MapIndexFunctor &f)
Returns a new distributed matrix with m_new[i] = f(i, j, m[i][j]). Note that besides the element itse...
void permutePartition(const Fct2< int, int, int, F1 > &newRow, const Fct2< int, int, int, F2 > &newCol)
Permutes the partitions of the distributed array according to the given functions newRow and newCol...
msl::DMatrix< R > map(MapFunctor &f)
Returns a new distributed matrix with m_new[i][j] = f(m[i][j]).
DMatrix< R > mapStencil(MapStencilFunctor &f, T neutral_value)
Non-inplace variant of the mapStencil skeleton.
void zipIndexInPlace(DMatrix< T2 > &b, ZipIndexFunctor &f)
Replaces each element m[i][j] of the distributed matrix with f(i, j, m[i][j], b[i][j]). Note that besides the elements themselves also the indices are passed to the functor.
void zipInPlace(DMatrix< T2 > &b, ZipFunctor &f)
Replaces each element m[i][j] of the distributed array with f(m[i][j], b[i][j]) with b being another ...
void gather(T **b)
Transforms a distributed matrix to an ordinary (two-dimnesional) array by copying each element to the...
void setCopyDistribution()
Switch the distribution scheme from distributed to copy distributed.
void rotateCols(const Fct1< int, int, F > &f)
Rotates the partitions of the distributed matrix cyclically in vertical direction.
void mapInPlace(MapFunctor &f)
Replaces each element m[i][j] of the distributed matrix with f(m[i][j]).
T getLocal(int row, int col) const
Returns the element at the given local indices (row, col). Note that 0 <= row < nLocal and 0 <= col <...
void printLocal()
Each process prints its local partition of the distributed array.
int getRows() const
Returns the number of rows of the distributed matrix.
int getBlocksInRow() const
Returns the number of blocks (local partitions) in a row.
int getCols() const
Returns the number of columns of the distributed matrix.
bool isLocal(int row, int col) const
Checks whether the element at the given global indices (row, col) is locally stored.
void transposeLocalPartition()
Transposes the local partition. Currently only implemented for nLocal == mLocal.
Contains global definitions such as macros, functions, enums and classes, and constants in order to c...
int getLocalCols() const
Returns the number of columns of the local partition.
void fill(const T &value)
Initializes the elements of the distributed matrix with the value value.
void freeDevice()
Manually free device memory.
int getFirstRow() const
Returns the index of the first row of the local partition.
void broadcastPartition(int blockRow, int blockCol)
Broadcasts the partition with index (blockRow, blockCol to all processes. Afterwards, each partition of the distributed matrix stores the same values. Note that 0 <= blockRow < n and 0 <= blockCol < m.
std::vector< T * > upload(bool allocOnly=0)
Manually upload the local partition to GPU memory.
~DMatrix()
Destructor.