Actual source code: vecscattercusp.cu (petsc-3.7.4, 2016-10-02)

/*
   Implements the various scatter operations on cusp vectors
*/

#define PETSC_SKIP_COMPLEX
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>          /*I "petscvec.h" I*/
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

#include <cuda_runtime.h>

PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_StoS stos_scatter;
  cudaError_t                err = cudaSuccess;
  cudaStream_t               stream;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;

  cci = new struct _p_PetscCUSPIndices;
  stos_scatter = new struct _p_VecScatterCUSPIndices_StoS;

  /* create the "from" indices */
  stos_scatter->fslots    = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep  = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots and copy the host indices over */
      err = cudaMalloc((void**)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->fslots   = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep  = fromStep;
      stos_scatter->fromMode  = VEC_SCATTER_CUSP_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
  }

  /* create the "to" indices */
  stos_scatter->tslots  = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep  = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots and copy the host indices over */
      err = cudaMalloc((void**)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep  = toStep;
      stos_scatter->toMode  = VEC_SCATTER_CUSP_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
  }

  /* allocate the stream variable */
  err = cudaStreamCreate(&stream);CHKERRCUSP((int)err);
  stos_scatter->stream = stream;

  /* the number of indices */
  stos_scatter->n = n;

  /* get the maximum number of coresident threads and thread blocks */
  cudaGetDevice(&device);
  cudaGetDeviceProperties(&props,device);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  if (props.major>=3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter     = (VecScatterCUSPIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_STOS;
  *ci = cci;
  return(0);
}
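
/* Illustrative usage sketch (not part of the original file): building a
   general-to-general sequential scatter from host index arrays. The helper
   name BuildExampleIndices and the concrete indices are hypothetical; only
   VecScatterCUSPIndicesCreate_StoS above is from this file. */
#if 0
static PetscErrorCode BuildExampleIndices(PetscCUSPIndices *ci)
{
  PetscErrorCode ierr;
  PetscInt       fslots[3] = {0,2,4};   /* read x[0],x[2],x[4] ... */
  PetscInt       tslots[3] = {1,3,5};   /* ... and write them into y[1],y[3],y[5] */

  /* toFirst/fromFirst/toStep/fromStep are ignored when explicit slot arrays are given */
  ierr = VecScatterCUSPIndicesCreate_StoS(3,0,0,0,0,tslots,fslots,ci);CHKERRQ(ierr);
  return(0);
}
#endif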

PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_PtoP ptop_scatter;

  cci = new struct _p_PetscCUSPIndices;
  ptop_scatter = new struct _p_VecScatterCUSPIndices_PtoP;

  /* this calculation assumes that the input indices are sorted */
  ptop_scatter->ns              = sendIndices[ns-1]-sendIndices[0]+1;
  ptop_scatter->sendLowestIndex = sendIndices[0];
  ptop_scatter->nr              = recvIndices[nr-1]-recvIndices[0]+1;
  ptop_scatter->recvLowestIndex = recvIndices[0];

  /* assign indices */
  cci->scatter     = (VecScatterCUSPIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_PTOP;

  *ci = cci;
  return(0);
}
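
/* Worked example of the span calculation above (illustrative values): for
   sorted sendIndices = {2,3,5,8} with ns = 4, the struct records
   sendLowestIndex = 2 and ns = 8-2+1 = 7, i.e. the full contiguous range
   [2,8] covering the send indices, not just the 4 listed entries. */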

PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  if (!ci || !*ci) return(0);
  try {
    if ((*ci)->scatterType == VEC_SCATTER_CUSP_PTOP) {
      delete (VecScatterCUSPIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err = cudaSuccess;
      VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)(*ci)->scatter;
      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUSP((int)err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUSP((int)err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUSP((int)err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s",ex);
  }
  return(0);
}

/* Insert operator */
class Insert {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a;
  }
};

/* Add operator */
class Add {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a+b;
  }
};

/* Max operator */
class Max {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
#if !defined(PETSC_USE_COMPLEX)
    /* PETSC_SKIP_COMPLEX is defined at the top of this file, so the complex
       branch (where PetscMax is undefined for complex scalars) never compiles */
    return PetscMax(a,b);
#endif
  }
};
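
/* The operators above are functor classes rather than __device__ function
   pointers: passing an instance as a template parameter lets nvcc inline the
   body into each kernel instantiation. A hypothetical additional reduction
   (not in the original file) would follow the same shape, e.g.:

     class Min {
      public:
       __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
         return PetscMin(a,b);
       }
     };

   and could then be passed as VecScatterCUSP_StoS_Dispatcher(...,Min()). */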

/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSG_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}
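
/* This kernel and the three below use the grid-stride-loop idiom: thread tidx
   handles entries i = tidx, tidx+grid_size, tidx+2*grid_size, ..., so a grid
   whose total thread count is smaller than n still covers every index, and
   the dispatcher below is free to cap the number of blocks it launches. */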

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSS_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}

template<class OPERATOR>
void VecScatterCUSP_StoS_Dispatcher(CUSPARRAY *xarray,CUSPARRAY *yarray,PetscCUSPIndices ci,ScatterMode mode,OPERATOR OP)
{
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;

  nBlocks = (int)ceil(((float)stos_scatter->n)/((float)nThreads))+1;
  if (nBlocks > stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    }
  } else {
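    /* SCATTER_REVERSE: the roles of the index sets are swapped, so the "to"
       indices/stride now address the source array x and the "from"
       indices/stride address the destination y */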
    if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    }
  }
}
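
/* Worked example of the launch configuration above (illustrative numbers):
   for n = 1000 and nThreads = 128, nBlocks = ceil(1000/128)+1 = 9. On a
   device reporting maxThreadsPerMultiProcessor = 2048 the cap is
   2048/128 = 16 blocks, so the request stands; for much larger n the grid
   is clamped to 16 blocks and the grid-stride loops absorb the remainder. */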

PetscErrorCode VecScatterCUSP_StoS(Vec x,Vec y,PetscCUSPIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *xarray,*yarray;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
  cudaError_t                err = cudaSuccess;

  ierr = VecCUSPAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUSPAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
#if !defined(PETSC_USE_COMPLEX)
    else if (addv == MAX_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
#endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUSP((int)err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUSP((int)err);
  }
  ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}
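
/* Illustrative end-to-end sketch (not part of the original file): scattering
   the first n entries of a SEQCUSP vector x into every other entry of y with
   a strided-to-strided mapping. The wrapper name ExampleStridedScatter is
   hypothetical; the three calls it makes are the functions defined above. */
#if 0
static PetscErrorCode ExampleStridedScatter(Vec x,Vec y,PetscInt n)
{
  PetscErrorCode   ierr;
  PetscCUSPIndices ci;

  /* from: x[0],x[1],...,x[n-1] (first=0, step=1); to: y[0],y[2],...,y[2n-2] (first=0, step=2) */
  ierr = VecScatterCUSPIndicesCreate_StoS(n,0,0,2,1,NULL,NULL,&ci);CHKERRQ(ierr);
  ierr = VecScatterCUSP_StoS(x,y,ci,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterCUSPIndicesDestroy(&ci);CHKERRQ(ierr);
  return(0);
}
#endif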