46 #include "exception.h"
48 #include "conversion.h"
64 #define MSL_USERFUNC __host__ __device__
72 #define MSL_GPUFUNC __device__
80 #define MSL_CPUFUNC __host__
85 #define CUDA_CHECK_RETURN(value) { \
86 cudaError_t _m_cudaStat = value; \
87 if (_m_cudaStat != cudaSuccess) { \
88 fprintf(stderr, "Error %s at line %d in file %s\n", \
89 cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
130 static int proc_entrance;
131 static int running_proc_no;
132 static int num_total_procs;
133 static int num_local_procs;
134 static double start_time;
135 static char* program_name;
136 static int distribution_mode;
137 static int task_group_size;
138 static int num_conc_kernels;
139 static int num_threads;
143 static int threads_per_block;
146 static bool debug_communication;
147 static bool use_timer;
148 static bool farm_statistics;
150 static std::vector<cudaStream_t> streams;
154 static const int ANY_TAG = MPI_ANY_TAG;
155 static const int MYTAG = 1;
156 static const int STOPTAG = 2;
157 static const int TERMINATION_TEST = 3;
158 static const int RANDOM_DISTRIBUTION = 1;
159 static const int CYCLIC_DISTRIBUTION = 2;
160 static const int DEFAULT_DISTRIBUTION = CYCLIC_DISTRIBUTION;
161 static const int UNDEFINED = -1;
162 static const int DEFAULT_TASK_GOUP_SIZE = 256;
163 static const int DEFAULT_NUM_CONC_KERNELS = 16;
164 static const int DEFAULT_NUM_RUNS = 1;
165 static const int DEFAUL_TILE_WIDTH = 16;
171 void initSkeletons(
int argc,
char** argv,
bool debug_communication = 0);
181 void printv(
const char* format, ...);
281 return blockIdx.x * blockDim.x + threadIdx.x +
282 blockIdx.y * blockDim.y + threadIdx.y;
284 return omp_get_thread_num();
300 if(std::numeric_limits<T>::has_infinity) {
301 return - std::numeric_limits<T>::infinity();
304 return std::numeric_limits<T>::min();
320 if(std::numeric_limits<T>::has_infinity) {
321 return std::numeric_limits<T>::infinity();
324 return std::numeric_limits<T>::max();
364 template <
typename T>
365 inline void MSL_Send(
int destination, T* send_buffer,
size_t size,
int tag = MYTAG);
377 template <
typename T>
378 inline void MSL_ISend(
int destination, T* send_buffer, MPI_Request& req,
size_t size,
int tag = MYTAG);
389 template <
typename T>
390 inline void MSL_Recv(
int source, T* recv_buffer,
size_t size,
int tag = MYTAG);
402 template <
typename T>
403 inline void MSL_Recv(
int source, T* recv_buffer, MPI_Status& stat,
size_t size,
int tag = MYTAG);
415 template <
typename T>
416 inline void MSL_IRecv(
int source, T* recv_buffer, MPI_Request& req,
size_t size,
int tag = MYTAG);
421 inline void MSL_SendReceive(
int destination, T* send_buffer, T* recv_buffer,
size_t size = 1);
435 void broadcast(T* buffer,
int*
const ids,
int np,
int idRoot,
size_t count);
449 void allgather(T* send_buffer, T* recv_buffer,
int*
const ids,
int np,
size_t count);
461 void allgather(T* send_buffer, T* recv_buffer,
size_t count);
472 template <
typename T>
488 template <
typename T>
489 inline void MSL_Send(
int destination, std::vector<T>& send_buffer,
int tag = MYTAG);
499 template <
typename T>
500 inline void MSL_Recv(
int source, std::vector<T>& recv_buffer,
int tag = MYTAG);
517 void throws(
const detail::Exception& e);
519 template <
typename C1,
typename C2>
520 inline C1 proj1_2(C1 a, C2 b);
522 template <
typename C1,
typename C2>
523 inline C2 proj2_2(C1 a, C2 b);
525 template <
typename F>
526 inline int auxRotateRows(
const Fct1<int, int, F>& f,
int blocks,
int row,
int col);
528 template <
typename F>
529 inline int auxRotateCols(
const Fct1<int, int, F>& f,
int blocks,
int row,
int col);
531 template <
typename T>
532 inline void show(T* a,
int size);
536 #include "../src/muesli_com.cpp"
bool isRootProcess()
Checks whether this is the process with id 0.
T getNegativeInfinity()
Returns the value which represents the negative infinity for the given type T. In case the given type...
Definition: muesli.h:297
void setNumRuns(int num_runs)
Sets the number of runs for a benchmark application.
MSL_USERFUNC size_t getUniqueID()
Returns a unique thread id.
Definition: muesli.h:278
Class Muesli contains globally available variables that determine the properties (number of running p...
Definition: muesli.h:126
void setNumConcurrentKernels(int num_kernels)
Sets the number of concurrent kernels per GPU. Only for the farm skeleton.
void fail_exit()
Used to quit the program on failure; must be called after initSkeletons().
void MSL_Broadcast(int source, T *buffer, int size)
Wrapper for the MPI_Bcast routine. Every process in MPI_COMM_WORLD participates.
void setNumGpus(int num_gpus)
Sets the number of GPUs to be used by each process.
void setNumThreads(int num_threads)
Sets the number of CPU threads.
void MSL_Send(int destination, T *send_buffer, size_t size, int tag=MYTAG)
Sends a buffer of type T to process destination.
Definition: distribution.h:39
void startTiming()
Starts timing.
void printv(const char *format,...)
Wrapper for printf. Only the process with id 0 prints the given format string.
void allgather(T *send_buffer, T *recv_buffer, int *const ids, int np, size_t count)
Implementation of the MPI_Allgather routine. Only the processes in ids participate.
T getPositiveInfinity()
Returns the value which represents the positive infinity for the given type T. In case the given type...
Definition: muesli.h:317
void MSL_SendTag(int destination, int tag)
Sends a message without content. Mainly used for control messages such as stop messages.
void setThreadsPerBlock(int threads_per_block)
Sets the number of threads per (one dimensional) block. Note that threads_per_block <= 1024...
void setFarmStatistics(bool val)
Switches on or off (depending on the value of val) collecting farm statistics.
void syncStreams()
Synchronizes the CUDA streams.
double stopTiming()
Ends timing.
void terminateSkeletons()
Terminates Muesli. Needs to be called at the end of a Muesli application.
void MSL_ReceiveTag(int source, int tag)
Receives a message without content. Mainly used for control messages such as stop messages...
void MSL_Recv(int source, T *recv_buffer, size_t size, int tag=MYTAG)
Receives a buffer of type T from process source.
void setTaskGroupSize(int size)
Sets the task group size (i.e. size of sets to be processed) for the heterogeneous farm skeleton...
void MSL_IRecv(int source, T *recv_buffer, MPI_Request &req, size_t size, int tag=MYTAG)
Receives (non-blocking) a buffer of type T from process source.
void MSL_ISend(int destination, T *send_buffer, MPI_Request &req, size_t size, int tag=MYTAG)
Sends (non-blocking) a buffer of type T to process destination.
void broadcast(T *buffer, int *const ids, int np, int idRoot, size_t count)
Implementation of the MPI_Bcast routine. Only the processes in ids participate.
void initSkeletons(int argc, char **argv, bool debug_communication=0)
Initializes Muesli. Needs to be called before any skeleton is used.
void splitTime(int run)
Prints the time elapsed since the last split time.
int ProcessorNo
Typedef for process numbers.
Definition: muesli.h:107