Muesli
 All Classes Namespaces Files Functions Typedefs Enumerations
muesli.h
Go to the documentation of this file.
1 /*
2  * muesli.h
3  *
4  * Author: Steffen Ernsting <s.ernsting@uni-muenster.de>
5  *
6  * -------------------------------------------------------------------------------
7  *
8  * The MIT License
9  *
10  * Copyright 2014 Steffen Ernsting <s.ernsting@uni-muenster.de>,
11  * Herbert Kuchen <kuchen@uni-muenster.de>.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this software and associated documentation files (the "Software"), to deal
15  * in the Software without restriction, including without limitation the rights
16  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17  * copies of the Software, and to permit persons to whom the Software is
18  * furnished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29  * THE SOFTWARE.
30  *
31  */
32 
33 #pragma once
34 
35 #include <mpi.h>
36 #include <iostream>
37 #include <cstdio>
38 #include <cstdlib>
39 #include <string>
40 #include <limits>
41 #include <omp.h>
42 #include <sstream>
43 #include <cstdarg>
44 #include <vector>
45 
46 #include "exception.h"
47 #include "curry.h"
48 #include "conversion.h"
49 #include "timer.h"
50 
// Function type qualifier macros and CUDA error checking.
// When compiled by nvcc (__CUDACC__ is defined) the MSL_* macros expand to CUDA
// function type qualifiers; with any other compiler they expand to nothing.
56 #ifdef __CUDACC__
57 
// Qualifier for functions that are compiled for both host and device.
64 #define MSL_USERFUNC __host__ __device__
65 
// Qualifier for functions that are compiled for the device only.
72 #define MSL_GPUFUNC __device__
73 
// Qualifier for functions that are compiled for the host only.
80 #define MSL_CPUFUNC __host__
81 
// Evaluates `value` (a cudaError_t expression) exactly once. If the result is not
// cudaSuccess, prints the CUDA error string together with line and file to stderr
// and terminates the program with EXIT_FAILURE.
// NOTE(review): the macro expands to a bare { } block, not do { } while (0), so
// `if (c) CUDA_CHECK_RETURN(x); else ...` does not compile; brace such call sites.
85 #define CUDA_CHECK_RETURN(value) { \
86  cudaError_t _m_cudaStat = value; \
87  if (_m_cudaStat != cudaSuccess) { \
88  fprintf(stderr, "Error %s at line %d in file %s\n", \
89  cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
90  exit(EXIT_FAILURE); \
91  } \
92  }
93 #else
94 // when compiled with compilers other than nvcc, the function type qualifier macros
95 // expand to an empty word.
// NOTE(review): CUDA_CHECK_RETURN is not defined in this branch.
96 #define MSL_USERFUNC
97 #define MSL_GPUFUNC
98 #define MSL_CPUFUNC
99 #endif
100 
104 namespace msl {
105 
// Typedef for process numbers.
107 typedef int ProcessorNo;
108 
// Distribution mode selector with the two enumerators DIST and COPY.
// NOTE(review): presumably DIST = data partitioned among the processes and
// COPY = data replicated on every process; confirm against the data structures.
119 enum Distribution {DIST, COPY};
120 
// Class Muesli holds globally available static variables that describe the
// properties of a running Muesli application: process ids and counts, CPU thread
// and GPU configuration, farm-skeleton settings, and benchmarking/timing options.
// All members are static; the class is used as a container for global state.
126 class Muesli
127 {
128 public:
129  static int proc_id; // process id
130  static int proc_entrance; // process entrance (farm skeleton)
131  static int running_proc_no; // running process number (farm skeleton)
132  static int num_total_procs; // number of total processes
133  static int num_local_procs; // equals num_total_procs except when nesting DP into TP skeletons
134  static double start_time; // start time of an application
135  static char* program_name; // program name of an application
136  static int distribution_mode; // for farm skeleton
137  static int task_group_size; // aggregated task group size (farm skeleton)
138  static int num_conc_kernels; // number of concurrent kernels (farm skeleton)
139  static int num_threads; // number of CPU threads
140  static int num_runs; // number of runs, for benchmarking
141  static int num_gpus; // number of GPUs
142  static int max_gpus; // maximum number of GPUs of each process
143  static int threads_per_block; // for one dimensional GPU thread blocks (DArray)
144  static int tpb_x; // for two dimensional GPU thread blocks (DMatrix)
145  static int tpb_y; // for two dimensional GPU thread blocks (DMatrix)
146  static bool debug_communication; // farm skeleton
147  static bool use_timer; // use a timer?
148  static bool farm_statistics; // collect statistics of how many tasks were processed by CPU/GPU
// The stream vector only exists when compiling with nvcc.
149 #ifdef __CUDACC__
150  static std::vector<cudaStream_t> streams; // cuda streams for multi-gpu
151 #endif
152 };
153 
// Message tags, distribution-mode identifiers and default configuration values.
// ANY_TAG forwards MPI's wildcard tag.
154 static const int ANY_TAG = MPI_ANY_TAG;
155 static const int MYTAG = 1; // used for ordinary messages containing data
156 static const int STOPTAG = 2; // used to stop the following process
157 static const int TERMINATION_TEST = 3;
// Identifiers for the distribution mode; the default is the cyclic one.
158 static const int RANDOM_DISTRIBUTION = 1;
159 static const int CYCLIC_DISTRIBUTION = 2;
160 static const int DEFAULT_DISTRIBUTION = CYCLIC_DISTRIBUTION;
161 static const int UNDEFINED = -1;
// NOTE(review): "GOUP" looks like a typo for "GROUP"; the identifier is kept
// unchanged here because code outside this header may reference it.
162 static const int DEFAULT_TASK_GOUP_SIZE = 256;
163 static const int DEFAULT_NUM_CONC_KERNELS = 16;
164 static const int DEFAULT_NUM_RUNS = 1;
// NOTE(review): "DEFAUL" looks like a typo for "DEFAULT"; kept unchanged for the
// same reason.
165 static const int DEFAUL_TILE_WIDTH = 16;
166 
167 
// Initializes Muesli. Needs to be called before any skeleton is used.
// debug_communication defaults to 0 (false).
171 void initSkeletons(int argc, char** argv, bool debug_communication = 0);
172 
// Terminates Muesli. Needs to be called at the end of a Muesli application.
176 void terminateSkeletons();
177 
// Wrapper for printf. Only the process with id 0 prints the given format string.
181 void printv(const char* format, ...);
182 
// Sets the number of CPU threads.
188 void setNumThreads(int num_threads);
189 
// Sets the number of runs for a benchmark application.
195 void setNumRuns(int num_runs);
196 
// Sets the number of GPUs to be used by each process.
202 void setNumGpus(int num_gpus);
203 
// Sets the number of threads per (one dimensional) block.
210 void setThreadsPerBlock(int threads_per_block);
211 
// NOTE(review): presumably the two dimensional counterpart (threads per block in
// x and y direction); confirm against the implementation.
219 void setThreadsPerBlock(int tpbX, int tpbY);
220 
// Sets the number of concurrent kernels per GPU. Only for the farm skeleton.
227 void setNumConcurrentKernels(int num_kernels);
228 
// Sets the task group size (i.e. size of sets to be processed) for the
// heterogeneous farm skeleton.
235 void setTaskGroupSize(int size);
236 
// Synchronizes the CUDA streams.
240 void syncStreams();
241 
// Starts timing.
245 void startTiming();
246 
// Prints the time elapsed since the last split time.
250 void splitTime(int run);
251 
// Ends timing.
// NOTE(review): the returned double is presumably the elapsed time; confirm.
257 double stopTiming();
258 
// Checks whether this is the process with id 0.
264 bool isRootProcess();
265 
// Switches on or off (depending on the value of val) collecting farm statistics.
270 void setFarmStatistics(bool val);
271 
277 MSL_USERFUNC
278 inline size_t getUniqueID()
279 {
280 #ifdef __CUDA_ARCH__
281  return blockIdx.x * blockDim.x + threadIdx.x +
282  blockIdx.y * blockDim.y + threadIdx.y;
283 #else
284  return omp_get_thread_num();
285 #endif
286 }
287 
/**
 * \brief Returns the value which represents negative infinity for the given type T.
 *
 * If T has a representation for infinity (std::numeric_limits<T>::has_infinity),
 * the negated infinity is returned. Otherwise std::numeric_limits<T>::min() is
 * returned, which is the most negative value for integer types.
 *
 * @tparam T Arithmetic type to query.
 * @return Negative infinity of T, or numeric_limits<T>::min() if T has no infinity.
 */
template<typename T>
inline T getNegativeInfinity()
{
  // given type has a value for infinity
  if (std::numeric_limits<T>::has_infinity) {
    return - std::numeric_limits<T>::infinity();
  }
  else { // given type has no value for infinity
    return std::numeric_limits<T>::min();
  }
}
307 
/**
 * \brief Returns the value which represents positive infinity for the given type T.
 *
 * If T has a representation for infinity (std::numeric_limits<T>::has_infinity),
 * that value is returned. Otherwise the largest finite value,
 * std::numeric_limits<T>::max(), is returned.
 *
 * @tparam T Arithmetic type to query.
 * @return Positive infinity of T, or numeric_limits<T>::max() if T has no infinity.
 */
template<typename T>
inline T getPositiveInfinity()
{
  // given type has a value for infinity
  if (std::numeric_limits<T>::has_infinity) {
    return std::numeric_limits<T>::infinity();
  }
  else { // given type has no value for infinity
    return std::numeric_limits<T>::max();
  }
}
327 
328 //
329 // SEND/RECV TAGS
330 //
331 
// Sends a message without content to process destination. Mainly used for
// control messages such as stop messages.
339 inline void MSL_SendTag(int destination, int tag);
340 
// Receives a message without content from process source. Mainly used for
// control messages such as stop messages.
348 inline void MSL_ReceiveTag(int source, int tag);
349 
350 
351 //
352 // SEND/RECV FOR DATA PARALLEL SKELETONS
353 //
354 
// Sends a buffer of type T to process destination. The tag defaults to MYTAG.
364 template <typename T>
365 inline void MSL_Send(int destination, T* send_buffer, size_t size, int tag = MYTAG);
366 
// Sends (non-blocking) a buffer of type T to process destination.
// NOTE(review): req is presumably the request handle to wait on; confirm.
377 template <typename T>
378 inline void MSL_ISend(int destination, T* send_buffer, MPI_Request& req, size_t size, int tag = MYTAG);
379 
// Receives a buffer of type T from process source. The tag defaults to MYTAG.
389 template <typename T>
390 inline void MSL_Recv(int source, T* recv_buffer, size_t size, int tag = MYTAG);
391 
// Overload of MSL_Recv that additionally takes an MPI_Status reference.
// NOTE(review): stat is presumably filled with the status of the receive; confirm.
402 template <typename T>
403 inline void MSL_Recv(int source, T* recv_buffer, MPI_Status& stat, size_t size, int tag = MYTAG);
404 
// Receives (non-blocking) a buffer of type T from process source.
415 template <typename T>
416 inline void MSL_IRecv(int source, T* recv_buffer, MPI_Request& req, size_t size, int tag = MYTAG);
417 
418 // Send/receive function for sending a buffer of type T to process \em destination and
419 // receiving a buffer of type T from the same process (destination).
420 template<typename T>
421 inline void MSL_SendReceive(int destination, T* send_buffer, T* recv_buffer, size_t size = 1);
422 
// Broadcast restricted to a subset of processes: only the processes in ids
// participate.
// NOTE(review): np is presumably the number of entries in ids and idRoot the
// broadcasting process; confirm against the implementation.
434 template<typename T>
435 void broadcast(T* buffer, int* const ids, int np, int idRoot, size_t count);
436 
// Allgather restricted to a subset of processes: only the processes in ids
// participate.
448 template<typename T>
449 void allgather(T* send_buffer, T* recv_buffer, int* const ids, int np, size_t count);
450 
// Overload of allgather without a process id list.
// NOTE(review): presumably all processes participate; confirm.
460 template<typename T>
461 void allgather(T* send_buffer, T* recv_buffer, size_t count);
462 
// Wrapper for the MPI broadcast routine. Every process in MPI_COMM_WORLD
// participates.
472 template <typename T>
473 inline void MSL_Broadcast(int source, T* buffer, int size);
474 
475 
476 //
477 // SEND/RECV FOR TASK PARALLEL SKELETONS
478 //
479 
// Overload of MSL_Send taking a std::vector<T> as send buffer; the tag defaults
// to MYTAG.
// NOTE(review): how the vector length is communicated is not visible here.
488 template <typename T>
489 inline void MSL_Send(int destination, std::vector<T>& send_buffer, int tag = MYTAG);
490 
// Overload of MSL_Recv taking a std::vector<T> as receive buffer; the tag
// defaults to MYTAG.
499 template <typename T>
500 inline void MSL_Recv(int source, std::vector<T>& recv_buffer, int tag = MYTAG);
501 
502 
503 //
504 // AUXILIARY FUNCTIONS
505 //
506 
// Used to quit the program on failure; must be used after initSkeletons().
510 void fail_exit();
511 
// Takes a detail::Exception by const reference.
// NOTE(review): presumably reports the exception and aborts; confirm.
517 void throws(const detail::Exception& e);
518 
// NOTE(review): presumably the projection onto the first of two arguments
// (returns a); confirm against the definition.
519 template <typename C1, typename C2>
520 inline C1 proj1_2(C1 a, C2 b);
521 
// NOTE(review): presumably the projection onto the second of two arguments
// (returns b); confirm against the definition.
522 template <typename C1, typename C2>
523 inline C2 proj2_2(C1 a, C2 b);
524 
// Helper taking an int -> int functor f plus a block count and a row/column pair.
// NOTE(review): presumably used by a row rotation skeleton; semantics are not
// visible in this header.
525 template <typename F>
526 inline int auxRotateRows(const Fct1<int, int, F>& f, int blocks, int row, int col);
527 
// Column counterpart of auxRotateRows with the same signature.
528 template <typename F>
529 inline int auxRotateCols(const Fct1<int, int, F>& f, int blocks, int row, int col);
530 
// NOTE(review): presumably prints the size elements of array a; confirm.
531 template <typename T>
532 inline void show(T* a, int size);
533 
534 }
535 
536 #include "../src/muesli_com.cpp"
537 
bool isRootProcess()
Checks whether this is the process with id 0.
T getNegativeInfinity()
Returns the value which represents the negative infinity for the given type T. In case the given type...
Definition: muesli.h:297
void setNumRuns(int num_runs)
Sets the number of runs for a benchmark application.
MSL_USERFUNC size_t getUniqueID()
Returns a unique thread id.
Definition: muesli.h:278
Class Muesli contains globally available variables that determine the properties (number of running p...
Definition: muesli.h:126
void setNumConcurrentKernels(int num_kernels)
Sets the number of concurrent kernels per GPU. Only for the farm skeleton.
void fail_exit()
Used to quit the program on failure, must be used after initSkeletons()
void MSL_Broadcast(int source, T *buffer, int size)
Wrapper for the MPI_Bcast routine. Every process in MPI_COMM_WORLD participates.
void setNumGpus(int num_gpus)
Sets the number of GPUs to be used by each process.
void setNumThreads(int num_threads)
Sets the number of CPU threads.
void MSL_Send(int destination, T *send_buffer, size_t size, int tag=MYTAG)
Sends a buffer of type T to process destination.
Definition: distribution.h:39
void startTiming()
Starts timing.
void printv(const char *format,...)
Wrapper for printf. Only process with id 0 prints the given format string.
void allgather(T *send_buffer, T *recv_buffer, int *const ids, int np, size_t count)
Implementation of the MPI_Allgather routine. Only the processes in ids participate.
T getPositiveInfinity()
Returns the value which represents the positive infinity for the given type T. In case the given type...
Definition: muesli.h:317
void MSL_SendTag(int destination, int tag)
Sends a message without content. Mainly used for control messages such as stop messages.
void setThreadsPerBlock(int threads_per_block)
Sets the number of threads per (one dimensional) block. Note that threads_per_block <= 1024...
void setFarmStatistics(bool val)
Switches on or off (depending on the value of val) collecting farm statistics.
void syncStreams()
Synchronizes the CUDA streams.
double stopTiming()
Ends timing.
void terminateSkeletons()
Terminates Muesli. Needs to be called at the end of a Muesli application.
void MSL_ReceiveTag(int source, int tag)
Receives a message without content. Mainly used for control messages such as stop messages...
void MSL_Recv(int source, T *recv_buffer, size_t size, int tag=MYTAG)
Receives a buffer of type T from process source.
void setTaskGroupSize(int size)
Sets the task group size (i.e. size of sets to be processed) for the heterogeneous farm skeleton...
void MSL_IRecv(int source, T *recv_buffer, MPI_Request &req, size_t size, int tag=MYTAG)
Receives (non-blocking) a buffer of type T from process source.
void MSL_ISend(int destination, T *send_buffer, MPI_Request &req, size_t size, int tag=MYTAG)
Sends (non-blocking) a buffer of type T to process destination.
void broadcast(T *buffer, int *const ids, int np, int idRoot, size_t count)
Implementation of the MPI_Bcast routine. Only the processes in ids participate.
void initSkeletons(int argc, char **argv, bool debug_communication=0)
Initializes Muesli. Needs to be called before any skeleton is used.
void splitTime(int run)
Prints the time elapsed since last split time.
int ProcessorNo
Typedef for process numbers.
Definition: muesli.h:107