Muesli
 All Classes Namespaces Files Functions Typedefs Enumerations
plmatrix.h
1 /*
2  * plmatrix.h
3  *
4  * Author: Steffen Ernsting <s.ernsting@uni-muenster.de>
5  *
6  * -------------------------------------------------------------------------------
7  *
8  * The MIT License
9  *
10  * Copyright 2014 Steffen Ernsting <s.ernsting@uni-muenster.de>,
11  * Herbert Kuchen <kuchen@uni-muenster.de.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this software and associated documentation files (the "Software"), to deal
15  * in the Software without restriction, including without limitation the rights
16  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17  * copies of the Software, and to permit persons to whom the Software is
18  * furnished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29  * THE SOFTWARE.
30  *
31  */
32 
33 #pragma once
34 
35 #include "argtype.h"
36 
37 namespace msl {
38 
48 template <typename T>
49 class PLMatrix : public ArgumentType
50 {
51 public:
55  PLMatrix(int n, int m, int r, int c, int ss, int tw, T nv)
56  : ArgumentType(), current_data(0), shared_data(0), n(n), m(m), rows(r),
57  cols(c), stencil_size(ss), firstRow(Muesli::proc_id*r), firstRowGPU(0),
58  tile_width(tw), width(2*stencil_size+tile_width), neutral_value(nv)
59  {
60  }
61 
66  {
67  }
68 
73  void addDevicePtr(T* d_ptr)
74  {
75  ptrs.push_back(d_ptr);
76  if (it != ptrs.begin()) {
77  it = ptrs.begin();
78  current_data = *it;
79  }
80  }
81 
86  void update()
87  {
88  if (++it == ptrs.end()) {
89  it = ptrs.begin();
90  }
91  current_data = *it;
92  }
93 
99  MSL_USERFUNC
100  int getRows() const
101  {
102  return rows;
103  }
104 
110  MSL_USERFUNC
111  int getCols() const
112  {
113  return cols;
114  }
115 
122  MSL_USERFUNC
123  T get(int row, int col) const
124  {
125 #ifdef __CUDA_ARCH__
126  // GPU version: read from shared memory.
127  int r = blockIdx.y * blockDim.y + threadIdx.y;
128  int c = blockIdx.x * blockDim.x + threadIdx.x;
129  int rowIndex = (row-firstRowGPU-r+stencil_size)+threadIdx.y;
130  int colIndex = (col-c+stencil_size)+threadIdx.x;
131  return shared_data[rowIndex*width + colIndex];
132 // if ((col < 0) || (col >= m)) {
133 // // out of bounds -> return neutral value
134 // return neutral_value;
135 // } else { // in bounds -> return desired value
136 // return current_data[(row-firstRow+stencil_size)*cols + col];
137 // }
138 #else
139  // CPU version: read from main memory.
140  // bounds check
141  if ((col < 0) || (col >= m)) {
142  // out of bounds -> return neutral value
143  return neutral_value;
144  } else { // in bounds -> return desired value
145  return current_data[(row-firstRow+stencil_size)*cols + col];
146  }
147 #endif
148  }
149 
150 #ifdef __CUDACC__
151 
155  __device__
156  void readToSharedMem(T* smem, int r, int c, int tile_width)
157  {
158  int tx = threadIdx.x; int ty = threadIdx.y;
159  int row = r-firstRow;
160 
161  // read assigned value into shared memory
162  smem[(ty+stencil_size)*width + tx+stencil_size] = current_data[(row+stencil_size)*cols + c];
163 
164  // read halo values
165  // first row of tile needs to read upper stencil_size rows of halo values
166  if (ty == 0) {
167  for (int i = 0; i < stencil_size; i++) {
168  smem[i*width + stencil_size+tx] = current_data[(row+i)*cols + c];
169  }
170  }
171 
172  // last row of tile needs to read lower stencil_size rows of halo values
173  if (ty == tile_width-1) {
174  for (int i = 0; i < stencil_size; i++) {
175  smem[(i+stencil_size+tile_width)*width + stencil_size+tx] =
176  current_data[(row+stencil_size+i+1)*cols + c];
177  }
178  }
179 
180  // first column of tile needs to read left hand side stencil_size columns of halo values
181  if (tx == 0) {
182  for (int i = 0; i < stencil_size; i++) {
183  if (c+i-stencil_size < 0) {
184  smem[(ty+stencil_size)*width + i] = neutral_value;
185  }
186  else
187  smem[(ty+stencil_size)*width + i] =
188  current_data[(row+stencil_size)*cols + c+i-stencil_size];
189  }
190  }
191 
192  // last column of tile needs to read right hand side stencil_size columns of halo values
193  if (tx == tile_width-1) {
194  for (int i = 0; i < stencil_size; i++) {
195  if (c+i+1 > m-1)
196  smem[(ty+stencil_size)*width + i+tile_width+stencil_size] = neutral_value;
197  else
198  smem[(ty+stencil_size)*width + i+tile_width+stencil_size] =
199  current_data[(row+stencil_size)*cols + c+i+1];
200  }
201  }
202 
203  // upper left corner
204  if (tx == 0 && ty == 0) {
205  for (int i = 0; i < stencil_size; i++) {
206  for (int j = 0; j < stencil_size; j++) {
207  if (c+j-stencil_size < 0)
208  smem[i*width + j] = neutral_value;
209  else
210  smem[i*width + j] = current_data[(row+i)*cols + c+j-stencil_size];
211  }
212  }
213  }
214 
215  // upper right corner
216  if (tx == tile_width-1 && ty == 0) {
217  for (int i = 0; i < stencil_size; i++) {
218  for (int j = 0; j < stencil_size; j++) {
219  if (c+j+1 > m-1)
220  smem[i*width + j+stencil_size+tile_width] = neutral_value;
221  else
222  smem[i*width + j+stencil_size+tile_width] = current_data[(row+i)*cols + c+j+1];
223  }
224  }
225  }
226 
227  // lower left corner
228  if (tx == 0 && ty == tile_width-1) {
229  for (int i = 0; i < stencil_size; i++) {
230  for (int j = 0; j < stencil_size; j++) {
231  if (c+j-stencil_size < 0)
232  smem[(i+stencil_size+tile_width)*width + j] = neutral_value;
233  else
234  smem[(i+stencil_size+tile_width)*width + j] =
235  current_data[(row+i+stencil_size+1)*cols + c+j-stencil_size];
236  }
237  }
238  }
239 
240  // lower right corner
241  if (tx == tile_width-1 && ty == tile_width-1) {
242  for (int i = 0; i < stencil_size; i++) {
243  for (int j = 0; j < stencil_size; j++) {
244  if (c+j+1 > m-1)
245  smem[(i+stencil_size+tile_width)*width + j+stencil_size+tile_width] = neutral_value;
246  else
247  smem[(i+stencil_size+tile_width)*width + j+stencil_size+tile_width] =
248  current_data[(row+i+stencil_size+1)*cols + c+j+1];
249  }
250  }
251  }
252 
253  __syncthreads();
254 
255  shared_data = smem;
256  }
257 #endif
258 
264  void setFirstRowGPU(int fr)
265  {
266  firstRowGPU = fr;
267  }
268 
269 private:
270  std::vector<T*> ptrs;
271  typename std::vector<T*>::iterator it;
272  T* current_data, *shared_data;
273  int n, m, rows, cols, stencil_size, firstRow, firstRowGPU, tile_width, width;
274  T neutral_value;
275 };
276 
277 }
278 
279 
280 
281 
282 
Class Muesli contains globally available variables that determine the properties (number of running p...
Definition: muesli.h:126
PLMatrix(int n, int m, int r, int c, int ss, int tw, T nv)
Constructor: creates a PLMatrix.
Definition: plmatrix.h:55
MSL_USERFUNC int getCols() const
Returns the number of columns of the padded local matrix.
Definition: plmatrix.h:111
void addDevicePtr(T *d_ptr)
Adds another pointer to data residing in GPU or in CPU memory, respectively.
Definition: plmatrix.h:73
void update()
Updates the pointer to point to current data (that resides in one of the GPUs memory or in CPU memory...
Definition: plmatrix.h:86
void setFirstRowGPU(int fr)
Sets the first row index for the current device.
Definition: plmatrix.h:264
Base class for argument types of functors.
Definition: argtype.h:47
MSL_USERFUNC int getRows() const
Returns the number of rows of the padded local matrix.
Definition: plmatrix.h:100
~PLMatrix()
Destructor.
Definition: plmatrix.h:65