Actual source code: veccuda2.cu
petsc-3.7.4 2016-10-02
1: /*
2: Implements the sequential CUDA vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <petsc/private/vecimpl.h>
9: #include <../src/vec/vec/impls/dvecimpl.h>
10: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
12: #include <cuda_runtime.h>
13: #include <thrust/device_ptr.h>
14: #include <thrust/transform.h>
15: #include <thrust/functional.h>
19: /*
20: Allocates space for the vector array on the GPU if it does not exist.
21: Does NOT change the PetscCUDAFlag for the vector.
22: Does NOT zero the CUDA array.
24: */
25: PetscErrorCode VecCUDAAllocateCheck(Vec v)
26: {
28: cudaError_t err;
29: cudaStream_t stream;
30: Vec_CUDA *veccuda;
33: if (!v->spptr) {
34: PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
35: veccuda = (Vec_CUDA*)v->spptr;
36: err = cudaMalloc((void **) &veccuda->GPUarray, sizeof(PetscScalar) * ((PetscBLASInt)v->map->n));CHKERRCUDA(err);
37: err = cudaStreamCreate(&stream);CHKERRCUDA(err);
38: veccuda->stream = stream;
39: veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
40: v->ops->destroy = VecDestroy_SeqCUDA;
41: if (v->valid_GPU_array == PETSC_CUDA_UNALLOCATED) {
42: if (v->data && ((Vec_Seq*)v->data)->array) {
43: v->valid_GPU_array = PETSC_CUDA_CPU;
44: } else {
45: v->valid_GPU_array = PETSC_CUDA_GPU;
46: }
47: }
48: }
49: return(0);
50: }
54: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
55: PetscErrorCode VecCUDACopyToGPU(Vec v)
56: {
58: cudaError_t err;
59: Vec_CUDA *veccuda;
60: PetscScalar *varray;
63: VecCUDAAllocateCheck(v);
64: if (v->valid_GPU_array == PETSC_CUDA_CPU) {
65: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
66: veccuda=(Vec_CUDA*)v->spptr;
67: varray=veccuda->GPUarray;
68: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
69: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
70: v->valid_GPU_array = PETSC_CUDA_BOTH;
71: }
72: return(0);
73: }
77: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci)
78: {
79: PetscScalar *varray;
81: cudaError_t err;
82: PetscScalar *cpuPtr, *gpuPtr;
83: Vec_Seq *s;
84: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
87: VecCUDAAllocateCheck(v);
88: if (v->valid_GPU_array == PETSC_CUDA_CPU) {
89: s = (Vec_Seq*)v->data;
91: PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
92: varray = ((Vec_CUDA*)v->spptr)->GPUarray;
93: gpuPtr = varray + ptop_scatter->recvLowestIndex;
94: cpuPtr = s->array + ptop_scatter->recvLowestIndex;
96: /* Note : this code copies the smallest contiguous chunk of data
97: containing ALL of the indices */
98: err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
100: // Set the buffer states
101: v->valid_GPU_array = PETSC_CUDA_BOTH;
102: PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
103: }
104: return(0);
105: }
110: /*
111: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
112: */
113: PetscErrorCode VecCUDACopyFromGPU(Vec v)
114: {
116: cudaError_t err;
117: Vec_CUDA *veccuda;
118: PetscScalar *varray;
121: VecCUDAAllocateCheckHost(v);
122: if (v->valid_GPU_array == PETSC_CUDA_GPU) {
123: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
124: veccuda=(Vec_CUDA*)v->spptr;
125: varray=veccuda->GPUarray;
126: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
127: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
128: v->valid_GPU_array = PETSC_CUDA_BOTH;
129: }
130: return(0);
131: }
135: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
136: which means that we need to recombine the data at some point before using any of the standard functions.
137: We could add another few flag-types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
138: which must always be called in pairs.
139: */
140: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci)
141: {
142: PetscScalar *varray;
144: cudaError_t err;
145: PetscScalar *cpuPtr, *gpuPtr;
146: Vec_Seq *s;
147: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
150: VecCUDAAllocateCheckHost(v);
151: if (v->valid_GPU_array == PETSC_CUDA_GPU) {
152: PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
154: varray=((Vec_CUDA*)v->spptr)->GPUarray;
155: s = (Vec_Seq*)v->data;
156: gpuPtr = varray + ptop_scatter->sendLowestIndex;
157: cpuPtr = s->array + ptop_scatter->sendLowestIndex;
159: /* Note : this code copies the smallest contiguous chunk of data
160: containing ALL of the indices */
161: err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
163: VecCUDARestoreArrayRead(v,&varray);
164: PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
165: }
166: return(0);
167: }
169: /*MC
170: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
172: Options Database Keys:
173: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
175: Level: beginner
177: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
178: M*/
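/*
   A minimal usage sketch for VECSEQCUDA (illustrative only, not part of the
   original source; the helper name ExampleCreateSeqCUDA is an assumption):
   the type can be set explicitly with VecSetType(), or selected at runtime
   with VecSetFromOptions() and -vec_type seqcuda.
*/
static PetscErrorCode ExampleCreateSeqCUDA(PetscInt n,Vec *x)
{
  VecCreate(PETSC_COMM_SELF,x);        /* VECSEQCUDA lives on a single process */
  VecSetSizes(*x,n,n);
  VecSetType(*x,VECSEQCUDA);           /* or VecSetFromOptions(*x) with -vec_type seqcuda */
  VecSet(*x,1.0);                      /* vector operations now execute on the GPU */
  return(0);
}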
182: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
183: {
184: PetscScalar *xarray,*yarray;
186: PetscBLASInt one=1,bn;
187: PetscScalar sone=1.0;
188: cublasStatus_t cberr;
189: cudaError_t err;
192: PetscBLASIntCast(yin->map->n,&bn);
193: VecCUDAGetArrayRead(xin,&xarray);
194: VecCUDAGetArrayReadWrite(yin,&yarray);
195: if (alpha == (PetscScalar)0.0) {
196: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
197: } else if (alpha == (PetscScalar)1.0) {
198: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
199: PetscLogFlops(2.0*yin->map->n);
200: } else {
201: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
202: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
203: PetscLogFlops(2.0*yin->map->n);
204: }
205: WaitForGPU();CHKERRCUDA(ierr);
206: VecCUDARestoreArrayRead(xin,&xarray);
207: VecCUDARestoreArrayReadWrite(yin,&yarray);
208: return(0);
209: }
213: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
214: {
215: PetscScalar *xarray,*yarray;
217: PetscBLASInt one=1,bn;
218: cublasStatus_t cberr;
221: if (alpha != (PetscScalar)0.0) {
222: PetscBLASIntCast(yin->map->n,&bn);
223: VecCUDAGetArrayRead(xin,&xarray);
224: VecCUDAGetArrayReadWrite(yin,&yarray);
225: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
226: WaitForGPU();CHKERRCUDA(ierr);
227: VecCUDARestoreArrayRead(xin,&xarray);
228: VecCUDARestoreArrayReadWrite(yin,&yarray);
229: PetscLogFlops(2.0*yin->map->n);
230: }
231: return(0);
232: }
236: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
237: {
238: PetscInt n = xin->map->n;
239: PetscScalar *warray=NULL,*xarray=NULL,*yarray=NULL;
240: thrust::device_ptr<PetscScalar> wptr,xptr,yptr;
241: PetscErrorCode ierr;
244: VecCUDAGetArrayWrite(win,&warray);
245: VecCUDAGetArrayRead(xin,&xarray);
246: VecCUDAGetArrayRead(yin,&yarray);
247: try {
248: wptr = thrust::device_pointer_cast(warray);
249: xptr = thrust::device_pointer_cast(xarray);
250: yptr = thrust::device_pointer_cast(yarray);
251: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
252: WaitForGPU();CHKERRCUDA(ierr);
253: } catch (char *ex) {
254: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
255: }
256: PetscLogFlops(n);
257: VecCUDARestoreArrayRead(xin,&xarray);
258: VecCUDARestoreArrayRead(yin,&yarray);
259: VecCUDARestoreArrayWrite(win,&warray);
260: return(0);
261: }
265: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
266: {
267: PetscScalar *xarray=NULL,*yarray=NULL,*warray=NULL;
269: PetscBLASInt one=1,bn;
270: cublasStatus_t cberr;
271: cudaError_t err;
274: PetscBLASIntCast(win->map->n,&bn);
275: if (alpha == (PetscScalar)0.0) {
276: VecCopy_SeqCUDA(yin,win);
277: } else {
278: VecCUDAGetArrayRead(xin,&xarray);
279: VecCUDAGetArrayRead(yin,&yarray);
280: VecCUDAGetArrayWrite(win,&warray);
281: err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
282: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
283: PetscLogFlops(2*win->map->n);
284: WaitForGPU();CHKERRCUDA(ierr);
285: VecCUDARestoreArrayRead(xin,&xarray);
286: VecCUDARestoreArrayRead(yin,&yarray);
287: VecCUDARestoreArrayWrite(win,&warray);
288: }
289: return(0);
290: }
294: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
295: {
297: PetscInt n = xin->map->n,j,j_rem;
298: PetscScalar alpha0,alpha1,alpha2,alpha3;
301: PetscLogFlops(nv*2.0*n);
302: switch (j_rem=nv&0x3) {
303: case 3:
304: alpha0 = alpha[0];
305: alpha1 = alpha[1];
306: alpha2 = alpha[2];
307: alpha += 3;
308: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
309: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
310: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
311: y += 3;
312: break;
313: case 2:
314: alpha0 = alpha[0];
315: alpha1 = alpha[1];
316: alpha +=2;
317: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
318: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
319: y +=2;
320: break;
321: case 1:
322: alpha0 = *alpha++;
323: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
324: y +=1;
325: break;
326: }
327: for (j=j_rem; j<nv; j+=4) {
328: alpha0 = alpha[0];
329: alpha1 = alpha[1];
330: alpha2 = alpha[2];
331: alpha3 = alpha[3];
332: alpha += 4;
333: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
334: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
335: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
336: VecAXPY_SeqCUDA(xin,alpha3,y[3]);
337: y += 4;
338: }
339: WaitForGPU();CHKERRCUDA(ierr);
340: return(0);
341: }
345: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
346: {
347: PetscScalar *xarray,*yarray;
349: PetscBLASInt one=1,bn;
350: cublasStatus_t cberr;
353: PetscBLASIntCast(yin->map->n,&bn);
354: VecCUDAGetArrayRead(xin,&xarray);
355: VecCUDAGetArrayRead(yin,&yarray);
356: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
357: cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
358: WaitForGPU();CHKERRCUDA(ierr);
359: if (xin->map->n >0) {
360: PetscLogFlops(2.0*xin->map->n-1);
361: }
362: VecCUDARestoreArrayRead(xin,&xarray);
363: VecCUDARestoreArrayRead(yin,&yarray);
364: return(0);
365: }
367: //
368: // CUDA kernels for MDot to follow
369: //
371: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
372: #define MDOT_WORKGROUP_SIZE 128
373: #define MDOT_WORKGROUP_NUM 128
375: #if !defined(PETSC_USE_COMPLEX)
376: // M = 2:
377: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
378: PetscInt size, PetscScalar *group_results)
379: {
380: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
381: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
382: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
383: PetscInt vec_start_index = blockIdx.x * entries_per_group;
384: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
386: PetscScalar entry_x = 0;
387: PetscScalar group_sum0 = 0;
388: PetscScalar group_sum1 = 0;
389: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
390: entry_x = x[i]; // load only once from global memory!
391: group_sum0 += entry_x * y0[i];
392: group_sum1 += entry_x * y1[i];
393: }
394: tmp_buffer[threadIdx.x] = group_sum0;
395: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
397: // parallel reduction
398: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
399: __syncthreads();
400: if (threadIdx.x < stride) {
401: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
402: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
403: }
404: }
406: // write result of group to group_results
407: if (threadIdx.x == 0) {
408: group_results[blockIdx.x] = tmp_buffer[0];
409: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
410: }
411: }
413: // M = 3:
414: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
415: PetscInt size, PetscScalar *group_results)
416: {
417: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
418: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
419: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
420: PetscInt vec_start_index = blockIdx.x * entries_per_group;
421: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
423: PetscScalar entry_x = 0;
424: PetscScalar group_sum0 = 0;
425: PetscScalar group_sum1 = 0;
426: PetscScalar group_sum2 = 0;
427: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
428: entry_x = x[i]; // load only once from global memory!
429: group_sum0 += entry_x * y0[i];
430: group_sum1 += entry_x * y1[i];
431: group_sum2 += entry_x * y2[i];
432: }
433: tmp_buffer[threadIdx.x] = group_sum0;
434: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
435: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
437: // parallel reduction
438: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
439: __syncthreads();
440: if (threadIdx.x < stride) {
441: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
442: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
443: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
444: }
445: }
447: // write result of group to group_results
448: if (threadIdx.x == 0) {
449: group_results[blockIdx.x ] = tmp_buffer[0];
450: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
451: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
452: }
453: }
455: // M = 4:
456: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
457: PetscInt size, PetscScalar *group_results)
458: {
459: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
460: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
461: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
462: PetscInt vec_start_index = blockIdx.x * entries_per_group;
463: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
465: PetscScalar entry_x = 0;
466: PetscScalar group_sum0 = 0;
467: PetscScalar group_sum1 = 0;
468: PetscScalar group_sum2 = 0;
469: PetscScalar group_sum3 = 0;
470: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
471: entry_x = x[i]; // load only once from global memory!
472: group_sum0 += entry_x * y0[i];
473: group_sum1 += entry_x * y1[i];
474: group_sum2 += entry_x * y2[i];
475: group_sum3 += entry_x * y3[i];
476: }
477: tmp_buffer[threadIdx.x] = group_sum0;
478: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
479: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
480: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
482: // parallel reduction
483: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
484: __syncthreads();
485: if (threadIdx.x < stride) {
486: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
487: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
488: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
489: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
490: }
491: }
493: // write result of group to group_results
494: if (threadIdx.x == 0) {
495: group_results[blockIdx.x ] = tmp_buffer[0];
496: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
497: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
498: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
499: }
500: }
502: // M = 8:
503: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
504: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
505: PetscInt size, PetscScalar *group_results)
506: {
507: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
508: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
509: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
510: PetscInt vec_start_index = blockIdx.x * entries_per_group;
511: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
513: PetscScalar entry_x = 0;
514: PetscScalar group_sum0 = 0;
515: PetscScalar group_sum1 = 0;
516: PetscScalar group_sum2 = 0;
517: PetscScalar group_sum3 = 0;
518: PetscScalar group_sum4 = 0;
519: PetscScalar group_sum5 = 0;
520: PetscScalar group_sum6 = 0;
521: PetscScalar group_sum7 = 0;
522: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
523: entry_x = x[i]; // load only once from global memory!
524: group_sum0 += entry_x * y0[i];
525: group_sum1 += entry_x * y1[i];
526: group_sum2 += entry_x * y2[i];
527: group_sum3 += entry_x * y3[i];
528: group_sum4 += entry_x * y4[i];
529: group_sum5 += entry_x * y5[i];
530: group_sum6 += entry_x * y6[i];
531: group_sum7 += entry_x * y7[i];
532: }
533: tmp_buffer[threadIdx.x] = group_sum0;
534: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
535: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
536: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
537: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
538: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
539: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
540: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
542: // parallel reduction
543: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
544: __syncthreads();
545: if (threadIdx.x < stride) {
546: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
547: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
548: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
549: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
550: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
551: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
552: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
553: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
554: }
555: }
557: // write result of group to group_results
558: if (threadIdx.x == 0) {
559: group_results[blockIdx.x ] = tmp_buffer[0];
560: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
561: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
562: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
563: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
564: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
565: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
566: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
567: }
568: }
569: #endif /* !defined(PETSC_USE_COMPLEX) */
573: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
574: {
576: PetscInt i,n = xin->map->n,current_y_index = 0;
577: PetscScalar *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
578: #if !defined(PETSC_USE_COMPLEX)
579: PetscInt j;
580: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
581: #endif
582: cudaError_t cuda_ierr;
583: PetscBLASInt one=1,bn;
584: cublasStatus_t cberr;
587: PetscBLASIntCast(xin->map->n,&bn);
588: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
589: /* Handle the case of local size zero first */
590: if (!xin->map->n) {
591: for (i=0; i<nv; ++i) z[i] = 0;
592: return(0);
593: }
595: // allocate scratchpad memory for the results of individual work groups:
596: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
598: VecCUDAGetArrayRead(xin,&xptr);
600: while (current_y_index < nv)
601: {
602: switch (nv - current_y_index) {
604: case 7:
605: case 6:
606: case 5:
607: case 4:
608: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
609: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
610: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
611: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
613: #if defined(PETSC_USE_COMPLEX)
614: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
615: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
616: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
617: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
618: #else
619: // run kernel:
620: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
622: // copy results back to the CPU:
623: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
625: // sum group results into z:
626: for (j=0; j<4; ++j) {
627: z[current_y_index + j] = 0;
628: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
629: }
630: #endif
631: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
632: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
633: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
634: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
635: current_y_index += 4;
636: break;
638: case 3:
639: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
640: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
641: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
643: #if defined(PETSC_USE_COMPLEX)
644: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
645: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
646: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
647: #else
648: // run kernel:
649: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
651: // copy results back to the CPU:
652: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
654: // sum group results into z:
655: for (j=0; j<3; ++j) {
656: z[current_y_index + j] = 0;
657: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
658: }
659: #endif
661: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
662: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
663: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
664: current_y_index += 3;
665: break;
667: case 2:
668: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
669: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
671: #if defined(PETSC_USE_COMPLEX)
672: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
673: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
674: #else
675: // run kernel:
676: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
678: // copy results back to the CPU:
679: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
681: // sum group results into z:
682: for (j=0; j<2; ++j) {
683: z[current_y_index + j] = 0;
684: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
685: }
686: #endif
687: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
688: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
689: current_y_index += 2;
690: break;
692: case 1:
693: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
694: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
695: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
696: current_y_index += 1;
697: break;
699: default: // 8 or more vectors left
700: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
701: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
702: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
703: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
704: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
705: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
706: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
707: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
709: #if defined(PETSC_USE_COMPLEX)
710: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
711: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
712: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
713: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
714: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
715: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
716: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
717: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
718: #else
719: // run kernel:
720: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
722: // copy results back to the CPU:
723: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
725: // sum group results into z:
726: for (j=0; j<8; ++j) {
727: z[current_y_index + j] = 0;
728: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
729: }
730: #endif
731: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
732: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
733: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
734: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
735: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
736: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
737: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
738: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
739: current_y_index += 8;
740: break;
741: }
742: }
743: VecCUDARestoreArrayRead(xin,&xptr);
745: cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
746: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
747: return(0);
748: }
750: #undef MDOT_WORKGROUP_SIZE
751: #undef MDOT_WORKGROUP_NUM
755: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
756: {
757: PetscInt n = xin->map->n;
758: PetscScalar *xarray=NULL;
759: thrust::device_ptr<PetscScalar> xptr;
760: PetscErrorCode ierr;
761: cudaError_t err;
764: VecCUDAGetArrayWrite(xin,&xarray);
765: if (alpha == (PetscScalar)0.0) {
766: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
767: } else {
768: try {
769: xptr = thrust::device_pointer_cast(xarray);
770: thrust::fill(xptr,xptr+n,alpha);
771: } catch (char *ex) {
772: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
773: }
774: }
775: WaitForGPU();CHKERRCUDA(ierr);
776: VecCUDARestoreArrayWrite(xin,&xarray);
777: return(0);
778: }
782: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
783: {
784: PetscScalar *xarray;
786: PetscBLASInt one=1,bn;
787: cublasStatus_t cberr;
790: if (alpha == (PetscScalar)0.0) {
791: VecSet_SeqCUDA(xin,alpha);
792: } else if (alpha != (PetscScalar)1.0) {
793: PetscBLASIntCast(xin->map->n,&bn);
794: VecCUDAGetArrayReadWrite(xin,&xarray);
795: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
796: VecCUDARestoreArrayReadWrite(xin,&xarray);
797: }
798: WaitForGPU();CHKERRCUDA(ierr);
799: PetscLogFlops(xin->map->n);
800: return(0);
801: }
805: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
806: {
807: PetscScalar *xarray,*yarray;
809: PetscBLASInt one=1,bn;
810: cublasStatus_t cberr;
813: PetscBLASIntCast(xin->map->n,&bn);
814: VecCUDAGetArrayRead(xin,&xarray);
815: VecCUDAGetArrayRead(yin,&yarray);
816: cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
817: WaitForGPU();CHKERRCUDA(ierr);
818: if (xin->map->n > 0) {
819: PetscLogFlops(2.0*xin->map->n-1);
820: }
821: VecCUDARestoreArrayRead(yin,&yarray);
822: VecCUDARestoreArrayRead(xin,&xarray);
823: return(0);
824: }
828: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
829: {
830: PetscScalar *xarray,*yarray;
832: cudaError_t err;
835: if (xin != yin) {
836: if (xin->valid_GPU_array == PETSC_CUDA_GPU) {
837: VecCUDAGetArrayRead(xin,&xarray);
838: VecCUDAGetArrayWrite(yin,&yarray);
839: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
840: WaitForGPU();CHKERRCUDA(ierr);
841: VecCUDARestoreArrayRead(xin,&xarray);
842: VecCUDARestoreArrayWrite(yin,&yarray);
844: } else if (xin->valid_GPU_array == PETSC_CUDA_CPU) {
845: /* copy on the CPU if we are on the CPU */
846: VecCopy_SeqCUDA_Private(xin,yin);
847: } else if (xin->valid_GPU_array == PETSC_CUDA_BOTH) {
848: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
849: if (yin->valid_GPU_array == PETSC_CUDA_CPU) {
850: /* copy in CPU */
851: VecCopy_SeqCUDA_Private(xin,yin);
853: } else if (yin->valid_GPU_array == PETSC_CUDA_GPU) {
854: /* copy in GPU */
855: VecCUDAGetArrayRead(xin,&xarray);
856: VecCUDAGetArrayWrite(yin,&yarray);
857: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
858: VecCUDARestoreArrayRead(xin,&xarray);
859: VecCUDARestoreArrayWrite(yin,&yarray);
860: } else if (yin->valid_GPU_array == PETSC_CUDA_BOTH) {
861: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
862: default to copying on the GPU (this is an arbitrary choice) */
863: VecCUDAGetArrayRead(xin,&xarray);
864: VecCUDAGetArrayWrite(yin,&yarray);
865: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
866: VecCUDARestoreArrayRead(xin,&xarray);
867: VecCUDARestoreArrayWrite(yin,&yarray);
868: } else {
869: VecCopy_SeqCUDA_Private(xin,yin);
870: }
871: }
872: }
873: return(0);
874: }
878: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
879: {
881: PetscBLASInt one = 1,bn;
882: PetscScalar *xarray,*yarray;
883: cublasStatus_t cberr;
886: PetscBLASIntCast(xin->map->n,&bn);
887: if (xin != yin) {
888: VecCUDAGetArrayReadWrite(xin,&xarray);
889: VecCUDAGetArrayReadWrite(yin,&yarray);
890: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
891: WaitForGPU();CHKERRCUDA(ierr);
892: VecCUDARestoreArrayReadWrite(xin,&xarray);
893: VecCUDARestoreArrayReadWrite(yin,&yarray);
894: }
895: return(0);
896: }
900: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
901: {
903: PetscScalar a = alpha,b = beta;
904: PetscScalar *xarray,*yarray;
905: PetscBLASInt one = 1, bn;
906: cublasStatus_t cberr;
907: cudaError_t err;
910: PetscBLASIntCast(yin->map->n,&bn);
911: if (a == (PetscScalar)0.0) {
912: VecScale_SeqCUDA(yin,beta);
913: } else if (b == (PetscScalar)1.0) {
914: VecAXPY_SeqCUDA(yin,alpha,xin);
915: } else if (a == (PetscScalar)1.0) {
916: VecAYPX_SeqCUDA(yin,beta,xin);
917: } else if (b == (PetscScalar)0.0) {
918: VecCUDAGetArrayRead(xin,&xarray);
919: VecCUDAGetArrayReadWrite(yin,&yarray);
920: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
921: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
922: PetscLogFlops(xin->map->n);
923: WaitForGPU();CHKERRCUDA(ierr);
924: VecCUDARestoreArrayRead(xin,&xarray);
925: VecCUDARestoreArrayReadWrite(yin,&yarray);
926: } else {
927: VecCUDAGetArrayRead(xin,&xarray);
928: VecCUDAGetArrayReadWrite(yin,&yarray);
929: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
930: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
931: VecCUDARestoreArrayRead(xin,&xarray);
932: VecCUDARestoreArrayReadWrite(yin,&yarray);
933: WaitForGPU();CHKERRCUDA(ierr);
934: PetscLogFlops(3.0*xin->map->n);
935: }
936: return(0);
937: }
941: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
942: {
944: PetscInt n = zin->map->n;
947: if (gamma == (PetscScalar)1.0) {
948: /* z = ax + b*y + z */
949: VecAXPY_SeqCUDA(zin,alpha,xin);
950: VecAXPY_SeqCUDA(zin,beta,yin);
951: PetscLogFlops(4.0*n);
952: } else {
953: /* z = a*x + b*y + c*z */
954: VecScale_SeqCUDA(zin,gamma);
955: VecAXPY_SeqCUDA(zin,alpha,xin);
956: VecAXPY_SeqCUDA(zin,beta,yin);
957: PetscLogFlops(5.0*n);
958: }
959: WaitForGPU();CHKERRCUDA(ierr);
960: return(0);
961: }
965: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
966: {
967: PetscInt n = win->map->n;
968: PetscScalar *warray,*xarray,*yarray;
969: thrust::device_ptr<PetscScalar> wptr,xptr,yptr;
973: VecCUDAGetArrayReadWrite(win,&warray);
974: VecCUDAGetArrayRead(xin,&xarray);
975: VecCUDAGetArrayRead(yin,&yarray);
976: try {
977: wptr = thrust::device_pointer_cast(warray);
978: xptr = thrust::device_pointer_cast(xarray);
979: yptr = thrust::device_pointer_cast(yarray);
980: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
981: WaitForGPU();CHKERRCUDA(ierr);
982: } catch (char *ex) {
983: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
984: }
985: VecCUDARestoreArrayRead(xin,&xarray);
986: VecCUDARestoreArrayRead(yin,&yarray);
987: VecCUDARestoreArrayReadWrite(win,&warray);
988: PetscLogFlops(n);
989: return(0);
990: }
992: /* should do infinity norm in cuda */
996: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
997: {
998: PetscErrorCode ierr;
999: PetscInt n = xin->map->n;
1000: PetscBLASInt one = 1, bn;
1001: PetscScalar *xarray;
1002: cublasStatus_t cberr;
1003: cudaError_t err;
1006: PetscBLASIntCast(n,&bn);
1007: if (type == NORM_2 || type == NORM_FROBENIUS) {
1008: VecCUDAGetArrayRead(xin,&xarray);
1009: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1010: WaitForGPU();CHKERRCUDA(ierr);
1011: VecCUDARestoreArrayRead(xin,&xarray);
1012: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1013: } else if (type == NORM_INFINITY) {
1014: PetscInt i;
1015: VecCUDAGetArrayRead(xin,&xarray);
1016: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1017: err = cudaMemcpy(z,xarray+i,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1018: VecCUDARestoreArrayRead(xin,&xarray);
1019: } else if (type == NORM_1) {
1020: VecCUDAGetArrayRead(xin,&xarray);
1021: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1022: VecCUDARestoreArrayRead(xin,&xarray);
1023: WaitForGPU();CHKERRCUDA(ierr);
1024: PetscLogFlops(PetscMax(n-1.0,0.0));
1025: } else if (type == NORM_1_AND_2) {
1026: VecNorm_SeqCUDA(xin,NORM_1,z);
1027: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1028: }
1029: return(0);
1030: }
1034: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1035: {
1036: PetscErrorCode ierr;
1037: PetscReal n=s->map->n;
1038: PetscScalar *sarray,*tarray;
1041: VecCUDAGetArrayRead(s,&sarray);
1042: VecCUDAGetArrayRead(t,&tarray);
1043: VecDot_SeqCUDA(s,t,dp);
1044: VecDot_SeqCUDA(t,t,nm);
1045: VecCUDARestoreArrayRead(s,&sarray);
1046: VecCUDARestoreArrayRead(t,&tarray);
1047: WaitForGPU();CHKERRCUDA(ierr);
1048: PetscLogFlops(4.0*n);
1049: return(0);
1050: }
1054: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1055: {
1057: cudaError_t err;
1060: if (v->spptr) {
1061: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray);CHKERRCUDA(err);
1062: ((Vec_CUDA*)v->spptr)->GPUarray = NULL;
1063: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1064: PetscFree(v->spptr);
1065: }
1066: VecDestroy_SeqCUDA_Private(v);
1067: return(0);
1068: }
1070: #if defined(PETSC_USE_COMPLEX)
1071: struct conjugate
1072: {
1073: __host__ __device__
1074: PetscScalar operator()(PetscScalar x)
1075: {
1076: return PetscConj(x);
1077: }
1078: };
1079: #endif
1083: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1084: {
1085: PetscScalar *xarray;
1086: PetscErrorCode ierr;
1087: #if defined(PETSC_USE_COMPLEX)
1088: PetscInt n = xin->map->n;
1089: thrust::device_ptr<PetscScalar> xptr;
1090: #endif
1093: VecCUDAGetArrayReadWrite(xin,&xarray);
1094: #if defined(PETSC_USE_COMPLEX)
1095: try {
1096: xptr = thrust::device_pointer_cast(xarray);
1097: thrust::transform(xptr,xptr+n,xptr,conjugate());
1098: WaitForGPU();CHKERRCUDA(ierr);
1099: } catch (char *ex) {
1100: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1101: }
1102: #endif
1103: VecCUDARestoreArrayReadWrite(xin,&xarray);
1104: return(0);
1105: }
1109: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1110: {
1111: VecType t;
1113: cudaError_t err;
1114: PetscBool flg;
1119: VecGetType(w,&t);
1120: PetscStrcmp(t,VECSEQCUDA,&flg);
1121: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUDA);
1123: if (w->data) {
1124: if (((Vec_Seq*)w->data)->array_allocated) {
1125: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1126: }
1127: ((Vec_Seq*)w->data)->array = NULL;
1128: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1129: }
1130: if (w->spptr) {
1131: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1132: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1133: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1134: }
1135: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1136: PetscFree(w->spptr);
1137: }
1139: if (v->petscnative) {
1140: PetscFree(w->data);
1141: w->data = v->data;
1142: w->valid_GPU_array = v->valid_GPU_array;
1143: w->spptr = v->spptr;
1144: PetscObjectStateIncrease((PetscObject)w);
1145: } else {
1146: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1147: w->valid_GPU_array = PETSC_CUDA_CPU;
1148: VecCUDAAllocateCheck(w);
1149: }
1150: return(0);
1151: }
1155: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1156: {
1157: VecType t;
1159: cudaError_t err;
1160: PetscBool flg;
1165: VecGetType(w,&t);
1166: PetscStrcmp(t,VECSEQCUDA,&flg);
1167: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUDA);
1169: if (v->petscnative) {
1170: v->data = w->data;
1171: v->valid_GPU_array = w->valid_GPU_array;
1172: v->spptr = w->spptr;
1173: VecCUDACopyFromGPU(v);
1174: PetscObjectStateIncrease((PetscObject)v);
1175: w->data = 0;
1176: w->valid_GPU_array = PETSC_CUDA_UNALLOCATED;
1177: w->spptr = 0;
1178: } else {
1179: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1180: if ((Vec_CUDA*)w->spptr) {
1181: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1182: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1183: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1184: PetscFree(w->spptr);
1185: }
1186: }
1187: return(0);
1188: }
1192: /*@C
1193: VecCUDAGetArrayReadWrite - Provides access to the CUDA buffer inside a vector.
1195: This function has semantics similar to VecGetArray(): the pointer
1196: returned by this function points to a consistent view of the vector
1197: data. This may involve a copy operation of data from the host to the
1198: device if the data on the device is out of date. If the device
1199: memory hasn't been allocated previously it will be allocated as part
1200: of this function call. VecCUDAGetArrayReadWrite() assumes that
1201: the user will modify the vector data. This is similar to
1202: intent(inout) in Fortran.
1204: The CUDA device pointer has to be released by calling
1205: VecCUDARestoreArrayReadWrite(). Upon restoring the vector data
1206: the data on the host will be marked as out of date. A subsequent
1207: access of the host data will thus incur a data transfer from the
1208: device to the host.
1211: Input Parameter:
1212: . v - the vector
1214: Output Parameter:
1215: . a - the CUDA device pointer
1217: Fortran note:
1218: This function is not currently available from Fortran.
1220: Level: intermediate
1222: .seealso: VecCUDARestoreArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1223: @*/
1224: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayReadWrite(Vec v, PetscScalar **a)
1225: {
1229: *a = 0;
1230: VecCUDACopyToGPU(v);
1231: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1232: return(0);
1233: }
1237: /*@C
1238: VecCUDARestoreArrayReadWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayReadWrite().
1240: This marks the host data as out of date. Subsequent access to the
1241: vector data on the host side with for instance VecGetArray() incurs a
1242: data transfer.
1244: Input Parameter:
1245: + v - the vector
1246: - a - the CUDA device pointer. This pointer is invalid after
1247: VecCUDARestoreArrayReadWrite() returns.
1249: Fortran note:
1250: This function is not currently available from Fortran.
1252: Level: intermediate
1254: .seealso: VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1255: @*/
1256: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayReadWrite(Vec v, PetscScalar **a)
1257: {
1261: v->valid_GPU_array = PETSC_CUDA_GPU;
1263: PetscObjectStateIncrease((PetscObject)v);
1264: return(0);
1265: }
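/*
   A minimal usage sketch for the read-write access pair above (illustrative
   only; the helper name ExampleScaleOnGPU and the use of cublasXscal are
   assumptions, not part of the original source): acquire the device pointer,
   modify the data in place, then restore it so the host copy is marked out
   of date.
*/
static PetscErrorCode ExampleScaleOnGPU(Vec v,PetscScalar alpha)
{
  PetscScalar    *varray;
  PetscBLASInt   one=1,bn;
  cublasStatus_t cberr;

  PetscBLASIntCast(v->map->n,&bn);
  VecCUDAGetArrayReadWrite(v,&varray);
  cberr = cublasXscal(cublasv2handle,bn,&alpha,varray,one);CHKERRCUBLAS(cberr);
  VecCUDARestoreArrayReadWrite(v,&varray);
  return(0);
}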
1269: /*@C
1270: VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.
1272: This function is analogous to VecGetArrayRead(): The pointer
1273: returned by this function points to a consistent view of the vector
1274: data. This may involve a copy operation of data from the host to the
1275: device if the data on the device is out of date. If the device
1276: memory hasn't been allocated previously it will be allocated as part
1277: of this function call. VecCUDAGetArrayRead() assumes that the
1278: user will not modify the vector data. This is analogous to
1279: intent(in) in Fortran.
1281: The CUDA device pointer has to be released by calling
1282: VecCUDARestoreArrayRead(). If the data on the host side was
1283: previously up to date it will remain so, i.e. data on both the device
1284: and the host is up to date. Accessing data on the host side does not
1285: incur a device to host data transfer.
1287: Input Parameter:
1288: . v - the vector
1290: Output Parameter:
1291: . a - the CUDA pointer.
1293: Fortran note:
1294: This function is not currently available from Fortran.
1296: Level: intermediate
1298: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1299: @*/
1300: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, PetscScalar **a)
1301: {
1305: *a = 0;
1306: VecCUDACopyToGPU(v);
1307: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1308: return(0);
1309: }
1313: /*@C
1314: VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().
1316: If the data on the host side was previously up to date it will remain
1317: so, i.e. data on both the device and the host is up to date.
1318: Accessing data on the host side e.g. with VecGetArray() does not
1319: incur a device to host data transfer.
1321: Input Parameter:
1322: + v - the vector
1323: - a - the CUDA device pointer. This pointer is invalid after
1324: VecCUDARestoreArrayRead() returns.
1326: Fortran note:
1327: This function is not currently available from Fortran.
1329: Level: intermediate
1331: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1332: @*/
1333: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, PetscScalar **a)
1334: {
1336: return(0);
1337: }
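/*
   A minimal usage sketch for the read-only access pair above (illustrative
   only; the helper name ExampleSumAbsOnGPU is an assumption): the device data
   is read but not modified, so an up-to-date host copy remains valid after
   the restore.
*/
static PetscErrorCode ExampleSumAbsOnGPU(Vec v,PetscReal *sum)
{
  PetscScalar    *varray;
  PetscBLASInt   one=1,bn;
  cublasStatus_t cberr;

  PetscBLASIntCast(v->map->n,&bn);
  VecCUDAGetArrayRead(v,&varray);
  cberr = cublasXasum(cublasv2handle,bn,varray,one,sum);CHKERRCUBLAS(cberr);
  VecCUDARestoreArrayRead(v,&varray);
  return(0);
}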
1341: /*@C
1342: VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.
1344: The data pointed to by the device pointer is uninitialized. The user
1345: may not read from this data. Furthermore, the entire array needs to
1346: be filled by the user to obtain well-defined behaviour. The device
1347: memory will be allocated by this function if it hasn't been allocated
1348: previously. This is analogous to intent(out) in Fortran.
1350: The device pointer needs to be released with
1351: VecCUDARestoreArrayWrite(). When the pointer is released the
1352: host data of the vector is marked as out of date. Subsequent access
1353: of the host data with e.g. VecGetArray() incurs a device to host data
1354: transfer.
1357: Input Parameter:
1358: . v - the vector
1360: Output Parameter:
1361: . a - the CUDA pointer
1363: Fortran note:
1364: This function is not currently available from Fortran.
1366: Level: advanced
1368: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1369: @*/
1370: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1371: {
1375: *a = 0;
1376: VecCUDAAllocateCheck(v);
1377: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1378: return(0);
1379: }
1383: /*@C
1384: VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().
1386: Data on the host will be marked as out of date. Subsequent access of
1387: the data on the host side e.g. with VecGetArray() will incur a device
1388: to host data transfer.
1390: Input Parameter:
1391: + v - the vector
1392: - a - the CUDA device pointer. This pointer is invalid after
1393: VecCUDARestoreArrayWrite() returns.
1395: Fortran note:
1396: This function is not currently available from Fortran.
1398: Level: intermediate
1400: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1401: @*/
1402: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1403: {
1407: v->valid_GPU_array = PETSC_CUDA_GPU;
1409: PetscObjectStateIncrease((PetscObject)v);
1410: return(0);
1411: }
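/*
   A minimal usage sketch for the write-only access pair above (illustrative
   only; the helper name ExampleZeroOnGPU is an assumption): the previous
   device contents are never read and the entire array is overwritten before
   the pointer is restored.
*/
static PetscErrorCode ExampleZeroOnGPU(Vec v)
{
  PetscScalar *varray;
  cudaError_t err;

  VecCUDAGetArrayWrite(v,&varray);
  err = cudaMemset(varray,0,v->map->n*sizeof(PetscScalar));CHKERRCUDA(err);
  VecCUDARestoreArrayWrite(v,&varray);
  return(0);
}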
1415: /*@C
1416: VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1417: GPU array provided by the user. This is useful to avoid copying an
1418: array into a vector.
1420: Not Collective
1422: Input Parameters:
1423: + vec - the vector
1424: - array - the GPU array
1426: Notes:
1427: You can return to the original GPU array with a call to VecCUDAResetArray().
1428: It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1429: same time on the same vector.
1431: Level: developer
1433: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()
1435: @*/
1436: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1437: {
1441: VecCUDACopyToGPU(vin);
1442: if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1443: ((Vec_Seq*)vin->data)->unplacedarray = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1444: ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1445: vin->valid_GPU_array = PETSC_CUDA_GPU;
1446: return(0);
1447: }
1451: /*@C
1452: VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1453: with a GPU array provided by the user. This is useful to avoid copying
1454: a GPU array into a vector.
1456: Not Collective
1458: Input Parameters:
1459: + vec - the vector
1460: - array - the GPU array
1462: Notes:
1463: This permanently replaces the GPU array and frees the memory associated
1464: with the old GPU array.
1466: The memory passed in CANNOT be freed by the user. It will be freed
1467: when the vector is destroyed.
1469: Not supported from Fortran
1471: Level: developer
1473: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()
1475: @*/
1476: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1477: {
1478: cudaError_t err;
1481: err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1482: ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1483: vin->valid_GPU_array = PETSC_CUDA_GPU;
1484: return(0);
1485: }
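/*
   A minimal usage sketch for VecCUDAReplaceArray() (illustrative only; the
   helper name ExampleReplaceGPUArray is an assumption): the caller allocates
   device memory of the right size and hands ownership to the vector, which
   frees it when the vector is destroyed.
*/
static PetscErrorCode ExampleReplaceGPUArray(Vec v)
{
  PetscScalar *newarray;
  PetscInt    n;
  cudaError_t err;

  VecGetLocalSize(v,&n);
  err = cudaMalloc((void**)&newarray,n*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemset(newarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  VecCUDAReplaceArray(v,newarray);     /* v now owns newarray; the caller must not cudaFree() it */
  return(0);
}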
1489: /*@C
1490: VecCUDAResetArray - Resets a vector to use its default memory. Call this
1491: after the use of VecCUDAPlaceArray().
1493: Not Collective
1495: Input Parameters:
1496: . vec - the vector
1498: Level: developer
1500: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()
1502: @*/
1503: PetscErrorCode VecCUDAResetArray(Vec vin)
1504: {
1508: VecCUDACopyToGPU(vin);
1509: ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1510: ((Vec_Seq*)vin->data)->unplacedarray = 0;
1511: vin->valid_GPU_array = PETSC_CUDA_GPU;
1512: return(0);
1513: }
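/*
   A minimal usage sketch for the VecCUDAPlaceArray()/VecCUDAResetArray() pair
   (illustrative only; the helper name ExampleBorrowGPUArray and the borrowed
   buffer are assumptions): temporarily point the vector at user-provided
   device memory, operate on it, then restore the vector's original GPU array.
*/
static PetscErrorCode ExampleBorrowGPUArray(Vec v,PetscScalar *user_gpu_array)
{
  VecCUDAPlaceArray(v,user_gpu_array); /* v now operates on user_gpu_array     */
  /* ... VecScale(), VecAXPY(), etc. now act on the placed device array ...    */
  VecCUDAResetArray(v);                /* the original GPU array is restored   */
  return(0);
}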