Muesli
darray.h
1 /*
2  * darray.h
3  *
4  * Author: Steffen Ernsting <s.ernsting@uni-muenster.de>
5  *
6  * -------------------------------------------------------------------------------
7  *
8  * The MIT License
9  *
10  * Copyright 2014 Steffen Ernsting <s.ernsting@uni-muenster.de>,
 11  * Herbert Kuchen <kuchen@uni-muenster.de>.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this software and associated documentation files (the "Software"), to deal
15  * in the Software without restriction, including without limitation the rights
16  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17  * copies of the Software, and to permit persons to whom the Software is
18  * furnished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29  * THE SOFTWARE.
30  *
31  */
32 
33 #pragma once
34 
35 #include "omp.h"
36 #include "mpi.h"
37 #include "muesli.h"
38 #include "exception.h"
39 #include "functors.h"
40 #include "larray.h"
41 #include "plarray.h"
42 #include "exec_plan.h"
43 #ifdef __CUDACC__
44 #include "map_kernels.cuh"
45 #include "zip_kernels.cuh"
46 #include "fold_kernels.cuh"
47 #include "copy_kernel.cuh"
48 #include "properties.cuh"
49 #include "exec_plan.h"
50 #endif
51 
52 namespace msl {
63 template <typename T>
64 class DArray
65 {
66 public:
67 
68  //
69  // CONSTRUCTORS / DESTRUCTOR
70  //
71 
75  DArray();
76 
83  DArray(int size, Distribution d = DIST);
84 
93  DArray(int size, const T& initial_value, Distribution d = DIST);
94 
104  DArray(int size, T* const initial_array, Distribution d = DIST);
105 
115  DArray(int size, T (*f)(int), Distribution d = DIST);
116 
126  template <typename F>
127  DArray(int size, const F& f, Distribution d = DIST);
128 
134  DArray(const DArray<T>& cs);
135 
139  ~DArray();
140 
141  // ASSIGNMENT OPERATOR
147  DArray<T>& operator=(const DArray<T>& rhs);
148 
149 
150  //
151  // FILL
152  //
153 
160  void fill(const T& value);
161 
169  void fill(T* const values);
170 
178  void fill(T (*f)(int));
179 
187  template <typename F>
188  void fill(const F& f);
189 
190 
191  //
192  // SKELETONS / COMPUTATION
193  //
194 
195  // SKELETONS / COMPUTATION / MAP
196 
202  template <typename MapFunctor>
203  void mapInPlace(MapFunctor& f);
204 
212  template <typename MapIndexFunctor>
213  void mapIndexInPlace(MapIndexFunctor& f);
214 
221  template <typename R, typename MapFunctor>
222  msl::DArray<R> map(MapFunctor& f);
223 
231  template <typename R, typename MapIndexFunctor>
232  DArray<R> mapIndex(MapIndexFunctor& f);
233 
241  template <typename MapStencilFunctor>
242  void mapStencilInPlace(MapStencilFunctor& f, T neutral_value);
243 
251  template <typename R, typename MapStencilFunctor>
252  DArray<R> mapStencil(MapStencilFunctor& f, T neutral_value);
253 
254 #ifndef __CUDACC__
255 
256  // SKELETONS / COMPUTATION / MAP / INPLACE
263  template <typename F>
264  void mapInPlace(const msl::Fct1<T, T, F>& f);
265 
272  void mapInPlace(T(*f)(T));
273 
274  // SKELETONS / COMPUTATION / MAP / INDEXINPLACE
282  template <typename F>
283  void mapIndexInPlace(const msl::Fct2<int, T, T, F>& f);
284 
292  void mapIndexInPlace(T(*f)(int, T));
293 
294  // SKELETONS / COMPUTATION / MAP
302  template <typename R, typename F>
303  msl::DArray<R> map(const msl::Fct1<T, R, F>& f);
304 
311  template <typename R>
312  msl::DArray<R> map(R(*f)(T));
313 
314  // SKELETONS / COMPUTATION / MAP / INDEX
322  template <typename R, typename F>
323  DArray<R> mapIndex(const msl::Fct2<int, T, R, F>& f);
324 
332  template <typename R>
333  DArray<R> mapIndex(R(*f)(int, T));
334 
335 #endif
336 
337  // SKELETONS / COMPUTATION / ZIP
338 
345  template <typename T2, typename ZipFunctor>
346  void zipInPlace(DArray<T2>& b, ZipFunctor& f);
347 
355  template <typename T2, typename ZipIndexFunctor>
356  void zipIndexInPlace(DArray<T2>& b, ZipIndexFunctor& f);
357 
364  template <typename R, typename T2, typename ZipFunctor>
365  DArray<R> zip(DArray<T2>& b, ZipFunctor& f);
366 
373  template <typename R, typename T2, typename ZipIndexFunctor>
374  DArray<R> zipIndex(DArray<T2>& b, ZipIndexFunctor& f);
375 
376 #ifndef __CUDACC__
377 
378  // SKELETONS / COMPUTATION / ZIP / INPLACE
386  template <typename T2, typename F>
387  void zipInPlace(DArray<T2>& b, const Fct2<T, T2, T, F>& f);
388 
396  template <typename T2>
397  void zipInPlace(DArray<T2>& b, T(*f)(T, T2));
398 
399  // SKELETONS / COMPUTATION / ZIP / INDEXINPLACE
407  template <typename T2, typename F>
408  void zipIndexInPlace(DArray<T2>& b, const Fct3<int, T, T2, T, F>& f);
409 
417  template <typename T2>
418  void zipIndexInPlace(DArray<T2>& b, T(*f)(int, T, T2));
419 
420  // SKELETONS / COMPUTATION / ZIP
428  template <typename R, typename T2, typename F>
429  DArray<R> zip(DArray<T2>& b, const Fct2<T, T2, R, F>& f);
430 
438  template <typename R, typename T2>
439  DArray<R> zip(DArray<T2>& b, R(*f)(T, T2));
440 
441  // SKELETONS / COMPUTATION / ZIP / INDEX
449  template <typename R, typename T2, typename F>
450  DArray<R> zipIndex(DArray<T2>& b, const Fct3<int, T, T2, R, F>& f);
451 
459  template <typename R, typename T2>
460  DArray<R> zipIndex(DArray<T2>& b, R(*f)(int, T, T2));
461 #endif
462 
463  // SKELETONS / COMPUTATION / FOLD
464 
465 #ifndef __CUDACC__
466 
479  template <typename FoldFunctor>
480  T fold(FoldFunctor& f, bool final_fold_on_cpu = 1);
481 
491  template <typename F>
492  T fold(const Fct2<T, T, T, F>& f);
493 
503  T fold(T(*f)(T, T));
504 
505 #else
506 
517  template <typename FoldFunctor>
518  T fold(FoldFunctor& f, bool final_fold_on_cpu = 0);
519 
520 #endif
521 
522 
523  //
524  // SKELETONS / COMMUNICATION
525  //
526 
527  // SKELETONS / COMMUNICATION / BROADCAST PARTITION
528 
536  void broadcastPartition(int partitionIndex);
537 
538  // SKELETONS / COMMUNICATION / GATHER
539 
547  void gather(T* b);
548 
557  void gather(msl::DArray<T>& da);
558 
559  // SKELETONS / COMMUNICATION / PERMUTE PARTITION
560 
568  template <class F>
569  inline void permutePartition(const Fct1<int, int, F>& f);
570 
578  inline void permutePartition(int (*f)(int));
579 
580 
581  //
582  // GETTERS AND SETTERS
583  //
584 
590  T* getLocalPartition() const;
591 
598  T get(int index) const;
599 
607  void set(int globalIndex, const T& v);
608 
614  int getSize() const;
615 
621  int getLocalSize() const;
622 
628  int getFirstIndex() const;
629 
637  bool isLocal(int index) const;
638 
646  T getLocal(int localIndex) const;
647 
655  void setLocal(int localIndex, const T& v);
656 
663  std::vector<GPUExecutionPlan<T> > getExecPlans();
664 
668  void setCopyDistribution();
669 
673  void setDistribution();
674 
675 
676  //
677  // AUXILIARY
678  //
679 
686  std::vector<T*> upload(bool allocOnly = 0);
687 
691  void download();
692 
696  void freeDevice();
697 
704  void setGpuDistribution(Distribution dist);
705 
 712  Distribution getGpuDistribution();
719  void show(const std::string& descr = std::string());
720 
724  void printLocal();
725 
726 private:
727  // local partition
728  T* localPartition;
 729  // position of processor in data parallel group of processors; zero-based
730  int id;
731  // number of elements
732  int n;
733  // number of local elements
734  int nLocal;
735  // first (global) index of local partition
736  int firstIndex;
737  // total number of MPI processes
738  int np;
739  // checks whether data is up to date in main (cpu) memory
740  bool cpuMemoryFlag;
741  // execution plans for each gpu
742  GPUExecutionPlan<T>* plans = 0;
 743  // distribution scheme among all processes (distributed or copy distributed)
744  Distribution dist;
745  // checks whether data is copy distributed among all gpus
746  bool gpuCopyDistributed = 0;
747 
748 
749  //
750  // AUXILIARY
751  //
752 
 753  // initializes the distributed array (used in constructors).
754  void init();
755 
756  // initializes the GPU execution plans.
757  void initGPU();
758 
759  // returns the GPU id that locally stores the element at index index.
760  int getGpuId(int index) const;
761 };
762 
763 } // namespace msl
764 
765 #include "../src/darray_common.cpp"
766 
767 #ifdef __CUDACC__
768 #include "../src/darray.cu"
769 #else
770 #include "../src/darray.cpp"
771 #endif
772 
773 
774 
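To give an impression of how the skeletons declared above are used together, here is a minimal host-side sketch. It relies only on declarations from this header plus the usual msl::initSkeletons/msl::terminateSkeletons setup calls, which are assumed here to be provided by muesli.h; the skeleton calls use the CPU function-pointer overloads declared above (available when not compiling with nvcc).

#include <iostream>
#include "muesli.h"
#include "darray.h"

int square(int x) { return x * x; }      // user function for mapInPlace
int add(int x, int y) { return x + y; }  // user function for fold

int main(int argc, char** argv)
{
  msl::initSkeletons(argc, argv);        // assumed setup call from muesli.h

  msl::DArray<int> a(16, 2);             // 16 elements, all initialized to 2, default distribution
  a.mapInPlace(square);                  // a[i] = a[i] * a[i]
  int sum = a.fold(add);                 // reduce all elements to a single value
  a.show("a");                           // print the whole array with a description

  std::cout << "sum = " << sum << std::endl;

  msl::terminateSkeletons();             // assumed teardown call
  return 0;
}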
DArray< R > mapStencil(MapStencilFunctor &f, T neutral_value)
Non-inplace variant of the mapStencil skeleton.
~DArray()
Destructor.
int getFirstIndex() const
Returns the first (global) index of the local partition.
T fold(FoldFunctor &f, bool final_fold_on_cpu=1)
Reduces all elements of the distributed array to a single element by successively applying the given functor f.
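As a brief illustration, a sketch using the CPU function-pointer overload of fold declared in the listing above (the array a is assumed to exist already):

#include "darray.h"

int add(int x, int y) { return x + y; }

int sumAll(msl::DArray<int>& a)
{
  return a.fold(add);  // combine all elements of a with add into a single value
}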
Distribution
Definition: distribution.h:39
void fill(const T &value)
Initializes the elements of the distributed array with the value value.
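Both fill overloads shown in the listing can be sketched as follows (da is assumed to be an existing msl::DArray<int>):

#include "darray.h"

int iota(int i) { return i; }  // value for global index i

void initialize(msl::DArray<int>& da)
{
  da.fill(42);    // every element becomes 42
  da.fill(iota);  // afterwards da[i] = i for every global index i
}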
Class DArray represents a distributed array.
Definition: darray.h:64
void setGpuDistribution(Distribution dist)
Set how the local partition is distributed among the GPUs. Current distribution schemes are: distributed and copy distributed.
void setCopyDistribution()
Switch the distribution scheme from distributed to copy distributed.
void show(const std::string &descr=std::string())
Prints the distributed array to standard output. Optionally, the user may pass a description that will be printed along with it.
bool isLocal(int index) const
Checks whether the element at the given global index index is locally stored.
void download()
Manually download the local partition from GPU memory.
void mapIndexInPlace(MapIndexFunctor &f)
Replaces each element a[i] of the distributed array with f(i, a[i]). Note that besides the element itself also its index is passed to the functor.
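A small sketch with the CPU function-pointer overload (the array a is assumed to exist already):

#include "darray.h"

int addIndex(int i, int x) { return x + i; }  // receives the global index and the element

void addIndices(msl::DArray<int>& a)
{
  a.mapIndexInPlace(addIndex);  // a[i] = a[i] + i
}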
DArray< R > zipIndex(DArray< T2 > &b, ZipIndexFunctor &f)
Non-inplace variant of the zipIndex skeleton.
void printLocal()
Each process prints its local partition of the distributed array.
void setLocal(int localIndex, const T &v)
Sets the element at the given local index localIndex to the given value v.
GPUExecutionPlan
Definition: exec_plan.h:36
void broadcastPartition(int partitionIndex)
Broadcasts the partition with index partitionIndex to all processes. Afterwards, each partition of the distributed array stores the elements of the broadcast partition.
void gather(T *b)
Transforms a distributed array to an ordinary array by copying each element to the given array b.
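A sketch of gathering into an ordinary buffer; the target is assumed to hold all getSize() elements:

#include <vector>
#include "darray.h"

std::vector<int> collect(msl::DArray<int>& a)
{
  std::vector<int> buf(a.getSize());  // room for every element of the distributed array
  a.gather(buf.data());               // copy all elements into the local buffer
  return buf;
}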
DArray()
Default constructor.
int getLocalSize() const
Returns the size of local partitions of the distributed array.
void setDistribution()
Switch the distribution scheme from copy distributed to distributed.
msl::DArray< R > map(MapFunctor &f)
Returns a new distributed array with a_new[i] = f(a[i]).
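For example, a sketch using the CPU function-pointer overload, producing an array with a different element type:

#include "darray.h"

double half(int x) { return x / 2.0; }

msl::DArray<double> halves(msl::DArray<int>& a)
{
  return a.map(half);  // new distributed array with a_new[i] = a[i] / 2.0
}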
void set(int globalIndex, const T &v)
Sets the element at the given global index globalIndex to the given value v, with 0 <= globalIndex < getSize().
T getLocal(int localIndex) const
Returns the element at the given local index localIndex. Note that 0 <= localIndex < getLocalSize() must hold.
void zipIndexInPlace(DArray< T2 > &b, ZipIndexFunctor &f)
Replaces each element a[i] of the distributed array with f(i, a[i], b[i]). Note that besides the elements themselves also the index is passed to the functor.
std::vector< T * > upload(bool allocOnly=0)
Manually upload the local partition to GPU memory.
void permutePartition(const Fct1< int, int, F > &f)
Permutes the partitions of the distributed array according to the given function f. f must be bijective and return the ID of the new process p_i to store the partition, with 0 <= i < np.
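A sketch of a valid permutation function; the number of processes is hard-coded here purely as an assumption for illustration:

#include "darray.h"

const int NUM_PROCS = 4;                         // assumed process count for this sketch
int mirror(int i) { return NUM_PROCS - 1 - i; }  // bijective on 0 .. NUM_PROCS-1

void reverseParts(msl::DArray<int>& a)
{
  a.permutePartition(mirror);  // process i and process NUM_PROCS-1-i exchange their partitions
}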
void mapStencilInPlace(MapStencilFunctor &f, T neutral_value)
Replaces each element a[i] of the distributed array with f(i, a). Note that the index i and the local partition are passed to the functor.
void zipInPlace(DArray< T2 > &b, ZipFunctor &f)
Replaces each element a[i] of the distributed array with f(a[i], b[i]) with b being another distribut...
int getSize() const
Returns the global size of the distributed array.
DArray< T > & operator=(const DArray< T > &rhs)
Assignment operator.
DArray< R > mapIndex(MapIndexFunctor &f)
Returns a new distributed array with a_new[i] = f(i, a[i]). Note that besides the element itself also its index is passed to the functor.
Contains global definitions such as macros, functions, enums and classes, and constants in order to configure the library.
DArray< R > zip(DArray< T2 > &b, ZipFunctor &f)
Non-inplace variant of the zip skeleton.
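For illustration, an element-wise addition of two distributed arrays with the CPU function-pointer overload:

#include "darray.h"

int add(int x, int y) { return x + y; }

msl::DArray<int> addArrays(msl::DArray<int>& a, msl::DArray<int>& b)
{
  return a.zip(b, add);  // c[i] = a[i] + b[i]
}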
void mapInPlace(MapFunctor &f)
Replaces each element a[i] of the distributed array with f(a[i]).
Distribution getGpuDistribution()
Returns the current GPU distribution scheme.
void freeDevice()
Manually free device memory.
T * getLocalPartition() const
Returns the local partition.
std::vector< GPUExecutionPlan< T > > getExecPlans()
Returns the GPU execution plans that store information about size, etc. for the GPU partitions.